Merge https://github.com/openmm/openmm

5a06df78 · tic20 · 8dd60914 · a9223eea · 8dd60914 · 8dd60914
Commit 5a06df78 authored Mar 04, 2020 by tic20
20 changed files
--- a/platforms/cuda/src/kernels/customCentroidBond.cu
+++ b/platforms/cuda/src/kernels/customCentroidBond.cu
-/**
- * Compute the center of each group.
- *
- * Each thread block handles one group at a time; groups are distributed over the blocks with a stride
- * of gridDim.x.  Every thread accumulates a weighted partial sum over the group's particles, and the
- * partial sums are then combined by a tree reduction in shared memory.
- *
- * The shared scratch array has 64 entries and the reduction starts with a stride of 32, so the kernel
- * must be launched with exactly 64 threads per block.
- *
- * @param posq             particle positions (only x, y and z are read)
- * @param groupParticles   particle indices of all groups, concatenated
- * @param groupWeights     weight that goes with each entry of groupParticles
- * @param groupOffsets     group i owns entries [groupOffsets[i], groupOffsets[i+1]) of the two arrays above
- * @param centerPositions  output: the weighted center of each group (w is set to 0)
- */
-extern "C" __global__ void computeGroupCenters(const real4* __restrict__ posq, const int* __restrict__ groupParticles,
-        const real* __restrict__ groupWeights, const int* __restrict__ groupOffsets, real4* __restrict__ centerPositions) {
-    __shared__ volatile real3 temp[64];
-    for (int group = blockIdx.x; group < NUM_GROUPS; group += gridDim.x) {
-        // The threads in this block work together to compute the center of one group.
-
-        int firstIndex = groupOffsets[group];
-        int lastIndex = groupOffsets[group+1];
-        real3 center = make_real3(0, 0, 0);
-        for (int index = threadIdx.x; index < lastIndex-firstIndex; index += blockDim.x) {
-            int atom = groupParticles[firstIndex+index];
-            real weight = groupWeights[firstIndex+index];
-            real4 pos = posq[atom];
-            center.x += weight*pos.x;
-            center.y += weight*pos.y;
-            center.z += weight*pos.z;
-        }
-
-        // Sum the values.  The bounds of the group loop depend only on blockIdx and gridDim, so all
-        // threads of the block run the same number of iterations and reach every barrier below.
-
-        int thread = threadIdx.x;
-        temp[thread].x = center.x;
-        temp[thread].y = center.y;
-        temp[thread].z = center.z;
-        __syncthreads();
-        for (int stride = 32; stride > 0; stride >>= 1) {
-            if (thread < stride) {
-                temp[thread].x += temp[thread+stride].x;
-                temp[thread].y += temp[thread+stride].y;
-                temp[thread].z += temp[thread+stride].z;
-            }
-
-            // Synchronize after every step instead of assuming that the threads of a warp run in
-            // lockstep.  The barrier after the last step also guarantees that nobody is still reading
-            // temp when the other threads overwrite their slots for the next group.
-
-            __syncthreads();
-        }
-        if (thread == 0)
-            centerPositions[group] = make_real4(temp[0].x, temp[0].y, temp[0].z, 0);
-    }
-}
-
-/**
- * Drop the w component of a real4, returning its x, y and z components as a real3.
- */
-inline __device__ real3 trim(real4 v) {
-    real3 xyz = make_real3(v.x, v.y, v.z);
-    return xyz;
-}
-
-/**
- * Return vec1-vec2 in the x, y and z components, with the squared length of that difference in w.
- * When periodic is true, APPLY_PERIODIC_TO_DELTA is applied to the difference before its length is taken.
- */
-inline __device__ real4 delta(real4 vec1, real4 vec2, bool periodic, real4 periodicBoxSize, real4 invPeriodicBoxSize,
-        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
-    real4 diff = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
-    if (periodic)
-        APPLY_PERIODIC_TO_DELTA(diff);
-    diff.w = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
-    return diff;
-}
-
-/**
- * Return the angle between vec1 and vec2.  Both vectors must carry their squared magnitude in w.
- */
-__device__ real computeAngle(real4 vec1, real4 vec2) {
-    real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
-    real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
-    bool nearSingularity = (cosine > 0.99f || cosine < -0.99f);
-    if (!nearSingularity)
-        return ACOS(cosine);
-
-    // This close to the singularity in acos(), get the angle from the cross product with asin() instead.
-
-    real3 crossProduct = cross(vec1, vec2);
-    real scale = vec1.w*vec2.w;
-    real angle = ASIN(SQRT(dot(crossProduct, crossProduct)/scale));
-    if (cosine < 0.0f)
-        angle = M_PI-angle;
-    return angle;
-}
-
-/**
- * Return the cross product of vec1 and vec2 in x, y and z, with its squared magnitude stored in w.
- */
-inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
-    real3 product = cross(vec1, vec2);
-    real4 result = make_real4(product.x, product.y, product.z, product.x*product.x+product.y*product.y+product.z*product.z);
-    return result;
-}
-
-/**
- * Compute the forces on groups based on the bonds.
- *
- * Threads stride over the bond indices [0, NUM_BONDS) by the total number of launched threads and run
- * COMPUTE_FORCE once per bond.  COMPUTE_FORCE, INIT_PARAM_DERIVS, SAVE_PARAM_DERIVS and EXTRA_ARGS are
- * macros that are not defined in this part of the file, so how the remaining arguments are used cannot
- * be seen here.  After the loop each thread adds its energy accumulator to its own slot of energyBuffer.
- */
-extern "C" __global__ void computeGroupForces(unsigned long long* __restrict__ groupForce, mixed* __restrict__ energyBuffer, const real4* __restrict__ centerPositions,
-        const int* __restrict__ bondGroups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
-        EXTRA_ARGS) {
-    mixed energy = 0; // per-thread accumulator; presumably updated inside COMPUTE_FORCE - confirm
-    INIT_PARAM_DERIVS
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_BONDS; index += blockDim.x*gridDim.x) {
-        COMPUTE_FORCE
-    }
-    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy; // one slot per launched thread
-    SAVE_PARAM_DERIVS
-}
-
-/**
- * Distribute the force acting on each group center to the atoms of the group, scaling it by each atom's weight.
- */
-extern "C" __global__ void applyForcesToAtoms(const int* __restrict__ groupParticles, const real* __restrict__ groupWeights, const int* __restrict__ groupOffsets,
-        const long long* __restrict__ groupForce, unsigned long long* __restrict__ atomForce) {
-    for (int group = blockIdx.x; group < NUM_GROUPS; group += gridDim.x) {
-        // The three components of a group's force are stored NUM_GROUPS entries apart.
-
-        const long long forceX = groupForce[group];
-        const long long forceY = groupForce[group+NUM_GROUPS];
-        const long long forceZ = groupForce[group+NUM_GROUPS*2];
-        const int start = groupOffsets[group];
-        const int count = groupOffsets[group+1]-start;
-        for (int member = threadIdx.x; member < count; member += blockDim.x) {
-            const int entry = start+member;
-            const int atom = groupParticles[entry];
-            const real weight = groupWeights[entry];
-            const long long scaledX = (long long) (forceX*weight);
-            const long long scaledY = (long long) (forceY*weight);
-            const long long scaledZ = (long long) (forceZ*weight);
-            atomicAdd(&atomForce[atom], static_cast<unsigned long long>(scaledX));
-            atomicAdd(&atomForce[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>(scaledY));
-            atomicAdd(&atomForce[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(scaledZ));
-        }
-    }
-}
--- a/platforms/cuda/src/kernels/customGBEnergyN2.cu
+++ b/platforms/cuda/src/kernels/customGBEnergyN2.cu
-#define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0x100000000))); // atomically adds the local variable deriv<INDEX>_1, scaled by 0x100000000 and converted to a 64-bit integer, to entry offset of the (INDEX-1)-th PADDED_NUM_ATOMS-sized section of derivBuffers
-#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0x100000000))); // same as STORE_DERIVATIVE_1, but the value comes from field deriv<INDEX> of this thread's localData entry
-
-typedef struct {
-    real3 pos; // position of the atom held in this slot
-    real3 force; // force accumulated on that atom while a tile is processed
-    ATOM_PARAMETER_DATA // further per-atom fields, supplied by a macro that is not defined in this part of the file
-#ifdef NEED_PADDING
-    float padding; // extra field that is present only when NEED_PADDING is defined
-#endif
-} AtomData; // element type of the shared localData array in computeN2Energy
-
-/**
- * Compute a force based on pair interactions.
- *
- * Atoms are processed in tiles of TILE_SIZE x TILE_SIZE pairs, and every run of TILE_SIZE consecutive
- * threads (a "warp" in the variable names below) is assigned a contiguous range of tiles.  The kernel
- * makes two passes: first over the tiles listed in exclusionTiles, then over the remaining tiles, which
- * are read from the neighbor list when USE_CUTOFF is defined and are enumerated directly otherwise.
- *
- * Forces are added to forceBuffers with atomicAdd as 64-bit integers (the value scaled by 0x100000000),
- * with the x, y and z components stored PADDED_NUM_ATOMS entries apart.  Energy is accumulated only
- * when needEnergy is set, and each thread adds its total to its own slot of energyBuffer.
- *
- * The pair interaction itself, the per-atom parameter handling and the derivative bookkeeping are
- * provided by macros (COMPUTE_INTERACTION, the LOAD_ and STORE_ families, RECORD_DERIVATIVE_2 and so
- * on) that are not defined in this part of the file.
- *
- * NOTE(review): lanes read localData entries written by other lanes of the same TILE_SIZE group with no
- * explicit barrier in between; this presumably relies on those lanes executing in lockstep - confirm
- * for the targeted architectures.
- */
-extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer,
-        const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const ushort2* __restrict__ exclusionTiles, bool needEnergy,
-#ifdef USE_CUTOFF
-        const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
-        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
-        const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms
-#else
-        unsigned int numTiles
-#endif
-        PARAMETER_ARGUMENTS) {
-    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE; // number of TILE_SIZE-wide thread groups in the launch
-    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // index of this thread's group
-    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // lane of this thread within its group
-    const unsigned int tbx = threadIdx.x - tgx; // index within the block of the group's first thread
-    mixed energy = 0;
-    INIT_PARAM_DERIVS
-    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
-
-    // First loop: process tiles that contain exclusions.
-    
-    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; // this group handles exclusion tiles [firstExclusionTile, lastExclusionTile)
-    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
-    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-        const ushort2 tileIndices = exclusionTiles[pos];
-        const unsigned int x = tileIndices.x;
-        const unsigned int y = tileIndices.y;
-        real3 force = make_real3(0);
-        DECLARE_ATOM1_DERIVATIVES
-        unsigned int atom1 = x*TILE_SIZE + tgx;
-        real4 pos1 = posq[atom1];
-        LOAD_ATOM1_PARAMETERS
-#ifdef USE_EXCLUSIONS
-        unsigned int excl = exclusions[pos*TILE_SIZE+tgx]; // one bit per atom of the y block; consumed from the low end below
-#endif
-        if (x == y) {
-            // This tile is on the diagonal.
-
-            const unsigned int localAtomIndex = threadIdx.x;
-            localData[localAtomIndex].pos = make_real3(pos1.x, pos1.y, pos1.z);
-            LOAD_LOCAL_PARAMETERS_FROM_1
-            for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                int atom2 = tbx+j; // index into localData for now; replaced by the global atom index below
-                real3 pos2 = localData[atom2].pos;
-                real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
-#ifdef USE_PERIODIC
-                APPLY_PERIODIC_TO_DELTA(delta)
-#endif
-                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                if (r2 < CUTOFF_SQUARED) {
-#endif
-                    real invR = RSQRT(r2);
-                    real r = r2*invR;
-                    LOAD_ATOM2_PARAMETERS
-                    atom2 = y*TILE_SIZE+j;
-                    real dEdR = 0;
-                    real tempEnergy = 0;
-                    const real interactionScale = 0.5f; // on a diagonal tile every pair is visited twice, once from each of its atoms
-#ifdef USE_EXCLUSIONS
-                    bool isExcluded = !(excl & 0x1); // a cleared low bit marks the pair as excluded
-#endif
-                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
-                        COMPUTE_INTERACTION
-                        dEdR /= -r;
-                    }
-                    if (needEnergy)
-                        energy += 0.5f*tempEnergy;
-                    delta *= dEdR;
-                    force.x -= delta.x;
-                    force.y -= delta.y;
-                    force.z -= delta.z;
-#ifdef USE_CUTOFF
-                }
-#endif
-#ifdef USE_EXCLUSIONS
-                excl >>= 1;
-#endif
-            }
-        }
-        else {
-            // This is an off-diagonal tile.
-
-            const unsigned int localAtomIndex = threadIdx.x;
-            unsigned int j = y*TILE_SIZE + tgx;
-            real4 tempPosq = posq[j];
-            localData[localAtomIndex].pos = make_real3(tempPosq.x, tempPosq.y, tempPosq.z);
-            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-            localData[localAtomIndex].force = make_real3(0);
-            CLEAR_LOCAL_DERIVATIVES
-#ifdef USE_EXCLUSIONS
-            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); // rotate so that bit 0 belongs to the first atom this lane visits (tj starts at tgx); NOTE(review): for tgx == 0 the left shift is by TILE_SIZE, which is undefined in C++ if TILE_SIZE equals the width of unsigned int - confirm
-#endif
-            unsigned int tj = tgx;
-            for (j = 0; j < TILE_SIZE; j++) {
-                int atom2 = tbx+tj;
-                real3 pos2 = localData[atom2].pos;
-                real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
-#ifdef USE_PERIODIC
-                APPLY_PERIODIC_TO_DELTA(delta)
-#endif
-                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                if (r2 < CUTOFF_SQUARED) {
-#endif
-                    real invR = RSQRT(r2);
-                    real r = r2*invR;
-                    LOAD_ATOM2_PARAMETERS
-                    atom2 = y*TILE_SIZE+tj;
-                    real dEdR = 0;
-                    real tempEnergy = 0;
-                    const real interactionScale = 1;
-#ifdef USE_EXCLUSIONS
-                    bool isExcluded = !(excl & 0x1);
-#endif
-                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                        COMPUTE_INTERACTION
-                        dEdR /= -r;
-                    }
-                    if (needEnergy)
-                        energy += tempEnergy;
-                    delta *= dEdR;
-                    force.x -= delta.x;
-                    force.y -= delta.y;
-                    force.z -= delta.z;
-                    atom2 = tbx+tj; // back to the localData index so the opposite force can be accumulated there
-                    localData[atom2].force.x += delta.x;
-                    localData[atom2].force.y += delta.y;
-                    localData[atom2].force.z += delta.z;
-                    RECORD_DERIVATIVE_2
-#ifdef USE_CUTOFF
-                }
-#endif
-#ifdef USE_EXCLUSIONS
-                excl >>= 1;
-#endif
-                tj = (tj + 1) & (TILE_SIZE - 1); // step to the next atom of the tile, wrapping around
-            }
-        }
-
-        // Write results.
-
-        unsigned int offset = x*TILE_SIZE + tgx;
-        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
-        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
-        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
-        STORE_DERIVATIVES_1
-        if (x != y) { // only off-diagonal tiles accumulated forces for the y block in localData
-            offset = y*TILE_SIZE + tgx;
-            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
-            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
-            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
-            STORE_DERIVATIVES_2
-        }
-    }
-
-    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
-    // of them (no cutoff).
-
-#ifdef USE_CUTOFF
-    unsigned int numTiles = interactionCount[0];
-    if (numTiles > maxTiles)
-        return; // There wasn't enough memory for the neighbor list.  NOTE(review): returning here also skips the energyBuffer update and SAVE_PARAM_DERIVS at the end of the kernel.
-    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); // NOTE(review): numTiles > maxTiles is always false here because of the return above
-    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
-#else
-    int pos = (int) (warp*(long long)numTiles/totalWarps);
-    int end = (int) ((warp+1)*(long long)numTiles/totalWarps);
-#endif
-    int skipBase = 0;
-    int currentSkipIndex = tbx;
-    __shared__ int atomIndices[THREAD_BLOCK_SIZE];
-    __shared__ volatile int skipTiles[THREAD_BLOCK_SIZE];
-    skipTiles[threadIdx.x] = -1; // -1 is below every tile index, which forces the first batch of exclusion tiles to be loaded
-    
-    while (pos < end) {
-        const bool isExcluded = false;
-        real3 force = make_real3(0);
-        DECLARE_ATOM1_DERIVATIVES
-        bool includeTile = true;
-        
-        // Extract the coordinates of this tile.
-        
-        int x, y;
-        bool singlePeriodicCopy = false;
-#ifdef USE_CUTOFF
-            x = tiles[pos];
-            real4 blockSizeX = blockSize[x];
-            singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
-                                  0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
-                                  0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-#else
-        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); // inverts the linear tile numbering used for skipTiles below to recover the row y
-        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-            y += (x < y ? -1 : 1);
-            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-        }
-
-        // Skip over tiles that have exclusions, since they were already processed.
-
-        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
-            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
-                ushort2 tile = exclusionTiles[skipBase+tgx];
-                skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; // linear index of the exclusion tile
-            }
-            else
-                skipTiles[threadIdx.x] = end; // sentinel: no exclusion tiles are left
-            skipBase += TILE_SIZE;            
-            currentSkipIndex = tbx;
-        }
-        while (skipTiles[currentSkipIndex] < pos)
-            currentSkipIndex++;
-        includeTile = (skipTiles[currentSkipIndex] != pos);
-#endif
-        if (includeTile) {
-            unsigned int atom1 = x*TILE_SIZE + tgx;
-
-            // Load atom data for this tile.
-
-            real4 pos1 = posq[atom1];
-            LOAD_ATOM1_PARAMETERS
-            const unsigned int localAtomIndex = threadIdx.x;
-#ifdef USE_CUTOFF
-            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
-#else
-            unsigned int j = y*TILE_SIZE + tgx;
-#endif
-            atomIndices[threadIdx.x] = j;
-            if (j < PADDED_NUM_ATOMS) { // presumably skips unused entries of interactingAtoms - confirm
-                real4 tempPosq = posq[j];
-                localData[localAtomIndex].pos = make_real3(tempPosq.x, tempPosq.y, tempPosq.z);
-                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-                localData[localAtomIndex].force = make_real3(0);
-                CLEAR_LOCAL_DERIVATIVES
-            }
-#ifdef USE_PERIODIC
-            if (singlePeriodicCopy) {
-                // The box is small enough that we can just translate all the atoms into a single periodic
-                // box, then skip having to apply periodic boundary conditions later.
-
-                real4 blockCenterX = blockCenter[x];
-                APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
-                APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x].pos, blockCenterX)
-                unsigned int tj = tgx;
-                for (j = 0; j < TILE_SIZE; j++) {
-                    int atom2 = tbx+tj;
-                    real3 pos2 = localData[atom2].pos;
-                    real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
-                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                    if (r2 < CUTOFF_SQUARED) {
-#endif
-                        real invR = RSQRT(r2);
-                        real r = r2*invR;
-                        LOAD_ATOM2_PARAMETERS
-                        atom2 = atomIndices[tbx+tj];
-                        real dEdR = 0;
-                        real tempEnergy = 0;
-                        const real interactionScale = 1;
-                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                            COMPUTE_INTERACTION
-                            dEdR /= -r;
-                        }
-                        if (needEnergy)
-                            energy += tempEnergy;
-                        delta *= dEdR;
-                        force.x -= delta.x;
-                        force.y -= delta.y;
-                        force.z -= delta.z;
-                        atom2 = tbx+tj;
-                        localData[atom2].force.x += delta.x;
-                        localData[atom2].force.y += delta.y;
-                        localData[atom2].force.z += delta.z;
-                        RECORD_DERIVATIVE_2
-#ifdef USE_CUTOFF
-                    }
-#endif
-                    tj = (tj + 1) & (TILE_SIZE - 1);
-                }
-            }
-            else
-#endif
-            {
-                // We need to apply periodic boundary conditions separately for each interaction.
-
-                unsigned int tj = tgx;
-                for (j = 0; j < TILE_SIZE; j++) {
-                    int atom2 = tbx+tj;
-                    real3 pos2 = localData[atom2].pos;
-                    real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
-#ifdef USE_PERIODIC
-                    APPLY_PERIODIC_TO_DELTA(delta)
-#endif
-                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                    if (r2 < CUTOFF_SQUARED) {
-#endif
-                        real invR = RSQRT(r2);
-                        real r = r2*invR;
-                        LOAD_ATOM2_PARAMETERS
-                        atom2 = atomIndices[tbx+tj];
-                        real dEdR = 0;
-                        real tempEnergy = 0;
-                        const real interactionScale = 1;
-                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                            COMPUTE_INTERACTION
-                            dEdR /= -r;
-                        }
-                        if (needEnergy)
-                            energy += tempEnergy;
-                        delta *= dEdR;
-                        force.x -= delta.x;
-                        force.y -= delta.y;
-                        force.z -= delta.z;
-                        atom2 = tbx+tj;
-                        localData[atom2].force.x += delta.x;
-                        localData[atom2].force.y += delta.y;
-                        localData[atom2].force.z += delta.z;
-                        RECORD_DERIVATIVE_2
-#ifdef USE_CUTOFF
-                    }
-#endif
-                    tj = (tj + 1) & (TILE_SIZE - 1);
-                }
-            }
-        
-            // Write results.
-
-            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
-            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
-            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
-            unsigned int offset = atom1;
-            STORE_DERIVATIVES_1
-#ifdef USE_CUTOFF
-            unsigned int atom2 = atomIndices[threadIdx.x];
-#else
-            unsigned int atom2 = y*TILE_SIZE + tgx;
-#endif
-            if (atom2 < PADDED_NUM_ATOMS) {
-                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
-                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
-                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
-                offset = atom2;
-                STORE_DERIVATIVES_2
-            }
-        }
-        pos++;
-    }
-    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy; // one slot per launched thread
-    SAVE_PARAM_DERIVS
-}
--- a/platforms/cuda/src/kernels/customGBEnergyPerParticle.cu
+++ b/platforms/cuda/src/kernels/customGBEnergyPerParticle.cu
-/**
- * Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
- *
- * Threads stride over the atom indices [0, NUM_ATOMS) by the total number of launched threads.  The
- * per-atom work is provided by the LOAD_DERIVATIVES and COMPUTE_ENERGY macros, which, like
- * INIT_PARAM_DERIVS, SAVE_PARAM_DERIVS and PARAMETER_ARGUMENTS, are not defined in this part of the
- * file.  After the loop each thread adds its energy accumulator to its own slot of energyBuffer.
- */
-
-extern "C" __global__ void computePerParticleEnergy(long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq
-        PARAMETER_ARGUMENTS) {
-    mixed energy = 0; // per-thread accumulator; presumably updated inside COMPUTE_ENERGY - confirm
-    INIT_PARAM_DERIVS
-    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
-        // Load the derivatives
-
-        LOAD_DERIVATIVES
-
-        // Now calculate the per-particle energy terms.
-
-        real4 pos = posq[index];
-        real3 force = make_real3(0, 0, 0); // presumably filled in and written out by COMPUTE_ENERGY - confirm
-        COMPUTE_ENERGY
-    }
-    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy; // one slot per launched thread
-    SAVE_PARAM_DERIVS
-}
--- a/platforms/cuda/src/kernels/customGBValueN2.cu
+++ b/platforms/cuda/src/kernels/customGBValueN2.cu
-typedef struct {
-    real3 pos; // position of the atom held in this slot
-    real value; // value accumulated for that atom while a tile is processed
-    ATOM_PARAMETER_DATA // further per-atom fields, supplied by a macro that is not defined in this part of the file
-#ifdef NEED_PADDING
-    float padding; // extra field that is present only when NEED_PADDING is defined
-#endif
-} AtomData; // element type of the shared localData array in computeN2Value
-
-/**
- * Compute a value based on pair interactions.
- */
-extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions,
-        const ushort2* __restrict__ exclusionTiles, unsigned long long* __restrict__ global_value,
-#ifdef USE_CUTOFF
-        const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
-        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
-        const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms
-#else
-        unsigned int numTiles
-#endif
-        PARAMETER_ARGUMENTS) {
-    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
-    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
-    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
-    const unsigned int tbx = threadIdx.x - tgx;
-    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
-
-    // First loop: process tiles that contain exclusions.
-    
-    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
-    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
-    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-        const ushort2 tileIndices = exclusionTiles[pos];
-        const unsigned int x = tileIndices.x;
-        const unsigned int y = tileIndices.y;
-        real value = 0;
-        unsigned int atom1 = x*TILE_SIZE + tgx;
-        real4 pos1 = posq[atom1];
-        LOAD_ATOM1_PARAMETERS
-#ifdef USE_EXCLUSIONS
-        unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
-#endif
-        if (x == y) {
-            // This tile is on the diagonal.
-
-            const unsigned int localAtomIndex = threadIdx.x;
-            localData[localAtomIndex].pos = make_real3(pos1.x, pos1.y, pos1.z);
-            LOAD_LOCAL_PARAMETERS_FROM_1
-            for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                int atom2 = tbx+j;
-                real3 pos2 = localData[atom2].pos;
-                real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
-#ifdef USE_PERIODIC
-                APPLY_PERIODIC_TO_DELTA(delta)
-#endif
-                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                if (r2 < CUTOFF_SQUARED) {
-#endif
-                    real invR = RSQRT(r2);
-                    real r = r2*invR;
-                    LOAD_ATOM2_PARAMETERS
-                    atom2 = y*TILE_SIZE+j;
-                    real tempValue1 = 0;
-                    real tempValue2 = 0;
-#ifdef USE_EXCLUSIONS
-                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
-                    if (!isExcluded && atom1 != atom2) {
-#else
-                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
-#endif
-                        COMPUTE_VALUE
-                    }
-                    value += tempValue1;
-                    ADD_TEMP_DERIVS1
-#ifdef USE_CUTOFF
-                }
-#endif
-#ifdef USE_EXCLUSIONS
-                excl >>= 1;
-#endif
-            }
-        }
-        else {
-            // This is an off-diagonal tile.
-
-            const unsigned int localAtomIndex = threadIdx.x;
-            unsigned int j = y*TILE_SIZE + tgx;
-            real4 tempPosq = posq[j];
-            localData[localAtomIndex].pos = make_real3(tempPosq.x, tempPosq.y, tempPosq.z);
-            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-            localData[localAtomIndex].value = 0;
-#ifdef USE_EXCLUSIONS
-            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
-#endif
-            unsigned int tj = tgx;
-            for (j = 0; j < TILE_SIZE; j++) {
-                int atom2 = tbx+tj;
-                real3 pos2 = localData[atom2].pos;
-                real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
-#ifdef USE_PERIODIC
-                APPLY_PERIODIC_TO_DELTA(delta)
-#endif
-                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                if (r2 < CUTOFF_SQUARED) {
-#endif
-                    real invR = RSQRT(r2);
-                    real r = r2*invR;
-                    LOAD_ATOM2_PARAMETERS
-                    atom2 = y*TILE_SIZE+tj;
-                    real tempValue1 = 0;
-                    real tempValue2 = 0;
-#ifdef USE_EXCLUSIONS
-                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
-                    if (!isExcluded) {
-#else
-                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-#endif
-                        COMPUTE_VALUE
-                    }
-                    value += tempValue1;
-                    localData[tbx+tj].value += tempValue2;
-                    ADD_TEMP_DERIVS1
-                    ADD_TEMP_DERIVS2
-#ifdef USE_CUTOFF
-                }
-#endif
-#ifdef USE_EXCLUSIONS
-                excl >>= 1;
-#endif
-                tj = (tj + 1) & (TILE_SIZE - 1);
-            }
-        }
-
-        // Write results.
-
-        unsigned int offset1 = x*TILE_SIZE + tgx;
-        atomicAdd(&global_value[offset1], static_cast<unsigned long long>((long long) (value*0x100000000)));
-        STORE_PARAM_DERIVS1
-        if (x != y) {
-            unsigned int offset2 = y*TILE_SIZE + tgx;
-            atomicAdd(&global_value[offset2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
-            STORE_PARAM_DERIVS2
-        }
-    }
-
-    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
-    // of them (no cutoff).
-
-#ifdef USE_CUTOFF
-    unsigned int numTiles = interactionCount[0];
-    if (numTiles > maxTiles)
-        return; // There wasn't enough memory for the neighbor list.
-    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
-    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
-#else
-    int pos = (int) (warp*(long long)numTiles/totalWarps);
-    int end = (int) ((warp+1)*(long long)numTiles/totalWarps);
-#endif
-    int skipBase = 0;
-    int currentSkipIndex = tbx;
-    __shared__ int atomIndices[THREAD_BLOCK_SIZE];
-    __shared__ volatile int skipTiles[THREAD_BLOCK_SIZE];
-    skipTiles[threadIdx.x] = -1;
-    
-    while (pos < end) {
-        real value = 0;
-        bool includeTile = true;
-        
-        // Extract the coordinates of this tile.
-        
-        int x, y;
-        bool singlePeriodicCopy = false;
-#ifdef USE_CUTOFF
-        x = tiles[pos];
-        real4 blockSizeX = blockSize[x];
-        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
-                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
-                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-#else
-        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-            y += (x < y ? -1 : 1);
-            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-        }
-
-        // Skip over tiles that have exclusions, since they were already processed.
-
-        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
-            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
-                ushort2 tile = exclusionTiles[skipBase+tgx];
-                skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
-            }
-            else
-                skipTiles[threadIdx.x] = end;
-            skipBase += TILE_SIZE;            
-            currentSkipIndex = tbx;
-        }
-        while (skipTiles[currentSkipIndex] < pos)
-            currentSkipIndex++;
-        includeTile = (skipTiles[currentSkipIndex] != pos);
-#endif
-        if (includeTile) {
-            unsigned int atom1 = x*TILE_SIZE + tgx;
-
-            // Load atom data for this tile.
-            
-            real4 pos1 = posq[atom1];
-            LOAD_ATOM1_PARAMETERS
-            const unsigned int localAtomIndex = threadIdx.x;
-#ifdef USE_CUTOFF
-            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
-#else
-            unsigned int j = y*TILE_SIZE + tgx;
-#endif
-            atomIndices[threadIdx.x] = j;
-            if (j < PADDED_NUM_ATOMS) {
-                real4 tempPosq = posq[j];
-                localData[localAtomIndex].pos = make_real3(tempPosq.x, tempPosq.y, tempPosq.z);
-                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-                localData[localAtomIndex].value = 0;
-            }
-#ifdef USE_PERIODIC
-            if (singlePeriodicCopy) {
-                // The box is small enough that we can just translate all the atoms into a single periodic
-                // box, then skip having to apply periodic boundary conditions later.
-
-                real4 blockCenterX = blockCenter[x];
-                APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
-                APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x].pos, blockCenterX)
-                unsigned int tj = tgx;
-                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    int atom2 = tbx+tj;
-                    real3 pos2 = localData[atom2].pos;
-                    real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
-                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                    if (r2 < CUTOFF_SQUARED) {
-                        real invR = RSQRT(r2);
-                        real r = r2*invR;
-                        LOAD_ATOM2_PARAMETERS
-                        atom2 = atomIndices[tbx+tj];
-                        real tempValue1 = 0;
-                        real tempValue2 = 0;
-                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                            COMPUTE_VALUE
-                        }
-                        value += tempValue1;
-                        localData[tbx+tj].value += tempValue2;
-                        ADD_TEMP_DERIVS1
-                        ADD_TEMP_DERIVS2
-                    }
-                    tj = (tj + 1) & (TILE_SIZE - 1);
-                }
-            }
-            else
-#endif
-            {
-                // We need to apply periodic boundary conditions separately for each interaction.
-
-                unsigned int tj = tgx;
-                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    int atom2 = tbx+tj;
-                    real3 pos2 = localData[atom2].pos;
-                    real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
-#ifdef USE_PERIODIC
-                    APPLY_PERIODIC_TO_DELTA(delta)
-#endif
-                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                    if (r2 < CUTOFF_SQUARED) {
-#endif
-                        real invR = RSQRT(r2);
-                        real r = r2*invR;
-                        LOAD_ATOM2_PARAMETERS
-                        atom2 = atomIndices[tbx+tj];
-                        real tempValue1 = 0;
-                        real tempValue2 = 0;
-                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                            COMPUTE_VALUE
-                        }
-                        value += tempValue1;
-                        localData[tbx+tj].value += tempValue2;
-                        ADD_TEMP_DERIVS1
-                        ADD_TEMP_DERIVS2
-#ifdef USE_CUTOFF
-                    }
-#endif
-                    tj = (tj + 1) & (TILE_SIZE - 1);
-                }
-            }
-        
-            // Write results.
-
-            unsigned int offset1 = atom1;
-            atomicAdd(&global_value[offset1], static_cast<unsigned long long>((long long) (value*0x100000000)));
-            STORE_PARAM_DERIVS1
-#ifdef USE_CUTOFF
-            unsigned int atom2 = atomIndices[threadIdx.x];
-#else
-            unsigned int atom2 = y*TILE_SIZE + tgx;
-#endif
-            if (atom2 < PADDED_NUM_ATOMS) {
-                unsigned int offset2 = atom2;
-                atomicAdd(&global_value[offset2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
-                STORE_PARAM_DERIVS2
-            }
-        }
-        pos++;
-    }
-}
--- a/platforms/cuda/src/kernels/customGBValuePerParticle.cu
+++ b/platforms/cuda/src/kernels/customGBValuePerParticle.cu
-/**
- * Reduce a pairwise computed value, and compute per-particle values.
- *
- * Each thread starts at its flat global thread index and advances by the total number of
- * launched threads until NUM_ATOMS is reached, so any 1D launch configuration covers all atoms.
- *
- * posq          per-atom real4 data, read once per atom into the local "pos"
- * valueBuffers  per-atom 64 bit fixed point accumulators; the stored integer is divided by 2^32
- *               (0x100000000) to recover the real value
- *
- * PARAMETER_ARGUMENTS, REDUCE_PARAM0_DERIV and COMPUTE_VALUES are supplied at kernel build time and
- * are not visible here; presumably they refer to the locals "index", "sum" and "pos" by name, so
- * those names should not be changed -- confirm against the code generator.
- */
-
-extern "C" __global__ void computePerParticleValues(real4* posq, long long* valueBuffers
-        PARAMETER_ARGUMENTS) {
-    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
-        // Load the pairwise value, converting it from 64 bit fixed point (scale 2^32) to real.
-
-        real sum = valueBuffers[index]/(real) 0x100000000;
-        REDUCE_PARAM0_DERIV
-        
-        // Now calculate other values
-
-        real4 pos = posq[index];
-        COMPUTE_VALUES
-    }
-}
--- a/platforms/cuda/src/kernels/customHbondForce.cu
+++ b/platforms/cuda/src/kernels/customHbondForce.cu
-/**
- * Drop the w component of a real4 and return its x, y and z components as a real3.
- */
-inline __device__ real3 trim(real4 v) {
-    real3 xyz = make_real3(v.x, v.y, v.z);
-    return xyz;
-}
-
-/**
- * Identity overload for values that are already real3.  It performs no work; it exists only
- * so that generated code can call trim() without caring which vector type it holds.
- */
-inline __device__ real3 trim(real3 v) {
-    return v;
-}
-
-/**
- * Compute the difference between two vectors, optionally taking periodic boundary conditions into account
- * and setting the fourth component to the squared magnitude.
- *
- * The x, y, z components of the result are vec1-vec2.  When USE_PERIODIC is defined,
- * APPLY_PERIODIC_TO_DELTA is applied to the result before its squared length is stored in w.
- * The periodic box arguments are not referenced directly in this function; presumably the
- * APPLY_PERIODIC_TO_DELTA macro (defined elsewhere) uses them by name -- confirm before renaming.
- */
-inline __device__ real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
-    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
-#ifdef USE_PERIODIC
-    APPLY_PERIODIC_TO_DELTA(result)
-#endif
-    // w is filled in last so that it reflects the (possibly wrapped) displacement.
-    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
-    return result;
-}
-
-/**
- * Return the angle between vec1 and vec2.  Each input must carry its squared length in w.
- *
- * The angle normally comes from acos() of the normalized dot product.  When the vectors are
- * nearly parallel or antiparallel (|cosine| > 0.99) acos() is ill conditioned, so the angle is
- * instead recovered from the magnitude of the cross product with asin().
- */
-inline __device__ real computeAngle(real4 vec1, real4 vec2) {
-    real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
-    real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
-    bool nearSingularity = (cosine > 0.99f || cosine < -0.99f);
-    if (!nearSingularity)
-        return ACOS(cosine);
-
-    // Close to the singularity in acos(): use |vec1 x vec2| / (|vec1| |vec2|) = sin(angle).
-
-    real3 normal = cross(vec1, vec2);
-    real lengthProductSquared = vec1.w*vec2.w;
-    real angle = ASIN(SQRT(dot(normal, normal)/lengthProductSquared));
-    if (cosine < 0.0f) {
-        // asin() only covers the first quadrant here; reflect for obtuse angles.
-        angle = M_PI-angle;
-    }
-    return angle;
-}
-
-/**
- * Return vec1 x vec2 as a real4 whose w component holds the squared length of the product.
- */
-inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
-    real3 product = cross(vec1, vec2);
-    real squaredLength = product.x*product.x + product.y*product.y + product.z*product.z;
-    return make_real4(product.x, product.y, product.z, squaredLength);
-}
-
-/**
- * Compute forces on donors.
- *
- * Each thread owns one donor per pass of the outer loop (donorIndex = donorStart + global thread
- * index) and loops over all acceptors.  Acceptor positions are staged through dynamic shared
- * memory in chunks of blockDim.x acceptors, three real4 entries per acceptor, so the launch must
- * provide at least 3*blockDim.x*sizeof(real4) bytes of dynamic shared memory.
- *
- * Forces are accumulated into the 64 bit fixed point force buffer (scale 2^32) with atomicAdd;
- * the x, y and z components of atom i live at i, i+PADDED_NUM_ATOMS and i+2*PADDED_NUM_ATOMS.
- * The per-thread energy is added to energyBuffer at the thread's global index.
- *
- * PARAMETER_ARGUMENTS and COMPUTE_DONOR_FORCE are supplied at kernel build time; presumably
- * COMPUTE_DONOR_FORCE refers to the locals declared here (f1, f2, f3, energy, d1..d3, a1..a3,
- * deltaD1A1, ...) by name, so those names must be kept.
- */
-extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ force, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
-        const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
-        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
-        PARAMETER_ARGUMENTS) {
-    extern __shared__ real4 posBuffer[];
-    mixed energy = 0;
-    for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += blockDim.x*gridDim.x) {
-        // The force accumulators belong to a single donor, so they must start from zero on every
-        // pass.  Otherwise, when there are more donors than threads, the forces of the donor
-        // handled in the previous pass would be added to this donor's atoms as well.
-
-        real3 f1 = make_real3(0);
-        real3 f2 = make_real3(0);
-        real3 f3 = make_real3(0);
-
-        // Load information about the donor this thread will compute forces on.
-
-        int donorIndex = donorStart+blockIdx.x*blockDim.x+threadIdx.x;
-        int4 atoms, exclusionIndices;
-        real4 d1, d2, d3;
-        if (donorIndex < NUM_DONORS) {
-            atoms = donorAtoms[donorIndex];
-            d1 = (atoms.x > -1 ? posq[atoms.x] : make_real4(0));
-            d2 = (atoms.y > -1 ? posq[atoms.y] : make_real4(0));
-            d3 = (atoms.z > -1 ? posq[atoms.z] : make_real4(0));
-#ifdef USE_EXCLUSIONS
-            exclusionIndices = exclusions[donorIndex];
-#endif
-        }
-        else
-            atoms = make_int4(-1, -1, -1, -1);
-        for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += blockDim.x) {
-            // Load the next block of acceptors into local memory.  Threads without a valid donor
-            // still take part, because the loads and barriers involve the whole thread block.
-
-            __syncthreads();
-            int blockSize = min((int) blockDim.x, NUM_ACCEPTORS-acceptorStart);
-            if (threadIdx.x < blockSize) {
-                int4 atoms2 = acceptorAtoms[acceptorStart+threadIdx.x];
-                posBuffer[3*threadIdx.x] = (atoms2.x > -1 ? posq[atoms2.x] : make_real4(0));
-                posBuffer[3*threadIdx.x+1] = (atoms2.y > -1 ? posq[atoms2.y] : make_real4(0));
-                posBuffer[3*threadIdx.x+2] = (atoms2.z > -1 ? posq[atoms2.z] : make_real4(0));
-            }
-            __syncthreads();
-            if (donorIndex < NUM_DONORS) {
-                for (int index = 0; index < blockSize; index++) {
-                    int acceptorIndex = acceptorStart+index;
-#ifdef USE_EXCLUSIONS
-                    if (acceptorIndex == exclusionIndices.x || acceptorIndex == exclusionIndices.y || acceptorIndex == exclusionIndices.z || acceptorIndex == exclusionIndices.w)
-                        continue;
-#endif
-                    // Compute the interaction between a donor and an acceptor.
-
-                    real4 a1 = posBuffer[3*index];
-                    real4 a2 = posBuffer[3*index+1];
-                    real4 a3 = posBuffer[3*index+2];
-                    real4 deltaD1A1 = delta(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
-#ifdef USE_CUTOFF
-                    if (deltaD1A1.w < CUTOFF_SQUARED) {
-#endif
-                        COMPUTE_DONOR_FORCE
-#ifdef USE_CUTOFF
-                    }
-#endif
-                }
-            }
-        }
-
-        // Write results
-
-        if (donorIndex < NUM_DONORS) {
-            if (atoms.x > -1) {
-                atomicAdd(&force[atoms.x], static_cast<unsigned long long>((long long) (f1.x*0x100000000)));
-                atomicAdd(&force[atoms.x+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.y*0x100000000)));
-                atomicAdd(&force[atoms.x+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.z*0x100000000)));
-                __threadfence_block();
-            }
-            if (atoms.y > -1) {
-                atomicAdd(&force[atoms.y], static_cast<unsigned long long>((long long) (f2.x*0x100000000)));
-                atomicAdd(&force[atoms.y+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.y*0x100000000)));
-                atomicAdd(&force[atoms.y+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.z*0x100000000)));
-                __threadfence_block();
-            }
-            if (atoms.z > -1) {
-                atomicAdd(&force[atoms.z], static_cast<unsigned long long>((long long) (f3.x*0x100000000)));
-                atomicAdd(&force[atoms.z+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.y*0x100000000)));
-                atomicAdd(&force[atoms.z+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.z*0x100000000)));
-                __threadfence_block();
-            }
-        }
-    }
-    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
-}
-/**
- * Compute forces on acceptors.
- *
- * Each thread owns one acceptor per pass of the outer loop (acceptorIndex = acceptorStart +
- * global thread index) and loops over all donors.  Donor positions are staged through dynamic
- * shared memory in chunks of blockDim.x donors, three real4 entries per donor, so the launch
- * must provide at least 3*blockDim.x*sizeof(real4) bytes of dynamic shared memory.
- *
- * Forces are accumulated into the 64 bit fixed point force buffer (scale 2^32) with atomicAdd;
- * the x, y and z components of atom i live at i, i+PADDED_NUM_ATOMS and i+2*PADDED_NUM_ATOMS.
- * This kernel does not write to energyBuffer.
- *
- * PARAMETER_ARGUMENTS and COMPUTE_ACCEPTOR_FORCE are supplied at kernel build time; presumably
- * COMPUTE_ACCEPTOR_FORCE refers to the locals declared here (f1, f2, f3, d1..d3, a1..a3,
- * deltaD1A1, ...) by name, so those names must be kept.
- */
-extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict__ force, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
-        const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
-        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
-        PARAMETER_ARGUMENTS) {
-    extern __shared__ real4 posBuffer[];
-    for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += blockDim.x*gridDim.x) {
-        // The force accumulators belong to a single acceptor, so they must start from zero on
-        // every pass.  Otherwise, when there are more acceptors than threads, the forces of the
-        // acceptor handled in the previous pass would be added to this acceptor's atoms as well.
-
-        real3 f1 = make_real3(0);
-        real3 f2 = make_real3(0);
-        real3 f3 = make_real3(0);
-
-        // Load information about the acceptor this thread will compute forces on.
-
-        int acceptorIndex = acceptorStart+blockIdx.x*blockDim.x+threadIdx.x;
-        int4 atoms, exclusionIndices;
-        real4 a1, a2, a3;
-        if (acceptorIndex < NUM_ACCEPTORS) {
-            atoms = acceptorAtoms[acceptorIndex];
-            a1 = (atoms.x > -1 ? posq[atoms.x] : make_real4(0));
-            a2 = (atoms.y > -1 ? posq[atoms.y] : make_real4(0));
-            a3 = (atoms.z > -1 ? posq[atoms.z] : make_real4(0));
-#ifdef USE_EXCLUSIONS
-            exclusionIndices = exclusions[acceptorIndex];
-#endif
-        }
-        else
-            atoms = make_int4(-1, -1, -1, -1);
-        for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += blockDim.x) {
-            // Load the next block of donors into local memory.  Threads without a valid acceptor
-            // still take part, because the loads and barriers involve the whole thread block.
-
-            __syncthreads();
-            int blockSize = min((int) blockDim.x, NUM_DONORS-donorStart);
-            if (threadIdx.x < blockSize) {
-                int4 atoms2 = donorAtoms[donorStart+threadIdx.x];
-                posBuffer[3*threadIdx.x] = (atoms2.x > -1 ? posq[atoms2.x] : make_real4(0));
-                posBuffer[3*threadIdx.x+1] = (atoms2.y > -1 ? posq[atoms2.y] : make_real4(0));
-                posBuffer[3*threadIdx.x+2] = (atoms2.z > -1 ? posq[atoms2.z] : make_real4(0));
-            }
-            __syncthreads();
-            if (acceptorIndex < NUM_ACCEPTORS) {
-                for (int index = 0; index < blockSize; index++) {
-                    int donorIndex = donorStart+index;
-#ifdef USE_EXCLUSIONS
-                    if (donorIndex == exclusionIndices.x || donorIndex == exclusionIndices.y || donorIndex == exclusionIndices.z || donorIndex == exclusionIndices.w)
-                        continue;
-#endif
-                    // Compute the interaction between a donor and an acceptor.
-
-                    real4 d1 = posBuffer[3*index];
-                    real4 d2 = posBuffer[3*index+1];
-                    real4 d3 = posBuffer[3*index+2];
-                    real4 deltaD1A1 = delta(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
-#ifdef USE_CUTOFF
-                    if (deltaD1A1.w < CUTOFF_SQUARED) {
-#endif
-                        COMPUTE_ACCEPTOR_FORCE
-#ifdef USE_CUTOFF
-                    }
-#endif
-                }
-            }
-        }
-
-        // Write results
-
-        if (acceptorIndex < NUM_ACCEPTORS) {
-            if (atoms.x > -1) {
-                atomicAdd(&force[atoms.x], static_cast<unsigned long long>((long long) (f1.x*0x100000000)));
-                atomicAdd(&force[atoms.x+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.y*0x100000000)));
-                atomicAdd(&force[atoms.x+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.z*0x100000000)));
-                __threadfence_block();
-            }
-            if (atoms.y > -1) {
-                atomicAdd(&force[atoms.y], static_cast<unsigned long long>((long long) (f2.x*0x100000000)));
-                atomicAdd(&force[atoms.y+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.y*0x100000000)));
-                atomicAdd(&force[atoms.y+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.z*0x100000000)));
-                __threadfence_block();
-            }
-            if (atoms.z > -1) {
-                atomicAdd(&force[atoms.z], static_cast<unsigned long long>((long long) (f3.x*0x100000000)));
-                atomicAdd(&force[atoms.z+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.y*0x100000000)));
-                atomicAdd(&force[atoms.z+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.z*0x100000000)));
-                __threadfence_block();
-            }
-        }
-    }
-}
--- a/platforms/cuda/src/kernels/customIntegratorPerDof.cu
+++ b/platforms/cuda/src/kernels/customIntegratorPerDof.cu
-/**
- * Read the position of particle "index".
- *
- * With USE_MIXED_PRECISION the position is kept as a real4 base value in posq plus a real4
- * correction in posqCorrection; the x, y and z components of the two are summed in mixed
- * precision, and w is taken from posq alone.  Otherwise posq[index] is returned unchanged.
- */
-inline __device__ mixed4 loadPos(const real4* __restrict__ posq, const real4* __restrict__ posqCorrection, int index) {
-#ifdef USE_MIXED_PRECISION
-    const real4 base = posq[index];
-    const real4 correction = posqCorrection[index];
-    const mixed x = base.x+(mixed)correction.x;
-    const mixed y = base.y+(mixed)correction.y;
-    const mixed z = base.z+(mixed)correction.z;
-    return make_mixed4(x, y, z, base.w);
-#else
-    return posq[index];
-#endif
-}
-
-/**
- * Write the position of particle "index".
- *
- * With USE_MIXED_PRECISION the value is split in two: posq receives each component rounded to
- * real, and posqCorrection receives the rounding remainder of x, y and z (its w is set to 0).
- * Otherwise the position is stored directly in posq.
- */
-inline __device__ void storePos(real4* __restrict__ posq, real4* __restrict__ posqCorrection, int index, mixed4 pos) {
-#ifdef USE_MIXED_PRECISION
-    const real roundedX = (real) pos.x;
-    const real roundedY = (real) pos.y;
-    const real roundedZ = (real) pos.z;
-    posq[index] = make_real4(roundedX, roundedY, roundedZ, (real) pos.w);
-    posqCorrection[index] = make_real4(pos.x-roundedX, pos.y-roundedY, pos.z-roundedZ, 0);
-#else
-    posq[index] = pos;
-#endif
-}
-
-/**
- * Widen a float4 to a double4, component by component.
- */
-inline __device__ double4 convertToDouble4(float4 a) {
-    double4 widened = make_double4(a.x, a.y, a.z, a.w);
-    return widened;
-}
-
-/**
- * Overload for values that are already double4; returns the argument unchanged.
- */
-inline __device__ double4 convertToDouble4(double4 a) {
-    return a;
-}
-
-/**
- * Convert a double4 to a mixed4, component by component.
- */
-inline __device__ mixed4 convertFromDouble4(double4 a) {
-    mixed4 converted = make_mixed4(a.x, a.y, a.z, a.w);
-    return converted;
-}
-
-/**
- * Apply one per-degree-of-freedom step of a custom integrator to every particle.
- *
- * Each thread starts at its flat global thread index and advances by the total number of
- * launched threads until NUM_ATOMS is reached.  For every particle the position, velocity and
- * force are loaded as doubles and the generated COMPUTE_STEP code is run, but only for particles
- * whose velocity.w is nonzero (mass is computed as 1/velocity.w, so w holds the inverse mass).
- *
- * force holds 64 bit fixed point values with a scale of 2^32; the x, y and z components of
- * particle i live at i, i+PADDED_NUM_ATOMS and i+2*PADDED_NUM_ATOMS.
- *
- * PARAMETER_ARGUMENTS and COMPUTE_STEP are supplied at kernel build time; presumably COMPUTE_STEP
- * refers to the locals declared here (position, velocity, f, mass, stepSize, gaussianIndex,
- * uniformIndex, index, ...) by name, so those names must be kept.
- */
-extern "C" __global__ void computePerDof(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta,
-        mixed4* __restrict__ velm, const long long* __restrict__ force, const mixed2* __restrict__ dt, const mixed* __restrict__ globals,
-        mixed* __restrict__ sum, const float4* __restrict__ gaussianValues, unsigned int gaussianBaseIndex, const float4* __restrict__ uniformValues,
-        const mixed energy, mixed* __restrict__ energyParamDerivs
-        PARAMETER_ARGUMENTS) {
-    double3 stepSize = make_double3(dt[0].y);
-    int index = blockIdx.x*blockDim.x+threadIdx.x;
-
-    // Forces are accumulated as value*0x100000000 (2^32), so that is the factor to divide by.
-    // Dividing by 0xFFFFFFFF (2^32-1) would scale every force up by a factor of about 1+2^-32.
-
-    const double forceScale = 1.0/0x100000000;
-    while (index < NUM_ATOMS) {
-#ifdef LOAD_POS_AS_DELTA
-        double4 position = convertToDouble4(loadPos(posq, posqCorrection, index)+posDelta[index]);
-#else
-        double4 position = convertToDouble4(loadPos(posq, posqCorrection, index));
-#endif
-        double4 velocity = convertToDouble4(velm[index]);
-        double4 f = make_double4(forceScale*force[index], forceScale*force[index+PADDED_NUM_ATOMS], forceScale*force[index+PADDED_NUM_ATOMS*2], 0.0);
-
-        // For a particle with velocity.w == 0 this is infinite, but it is never used because the
-        // step below is skipped for such particles.
-
-        double3 mass = make_double3(1.0/velocity.w);
-        if (velocity.w != 0.0) {
-            int gaussianIndex = gaussianBaseIndex;
-            int uniformIndex = 0;
-            COMPUTE_STEP
-        }
-        index += blockDim.x*gridDim.x;
-    }
-}
--- a/platforms/cuda/src/kernels/customManyParticle.cu
+++ b/platforms/cuda/src/kernels/customManyParticle.cu
-/**
- * Atomically add a force to the global force buffers for one atom.
- *
- * Each component is converted to 64 bit fixed point (scale 2^32) before being added.  The x, y
- * and z components of an atom are stored PADDED_NUM_ATOMS entries apart.
- */
-inline __device__ void storeForce(int atom, real3 force, unsigned long long* __restrict__ forceBuffers) {
-    const long long fixedX = (long long) (force.x*0x100000000);
-    const long long fixedY = (long long) (force.y*0x100000000);
-    const long long fixedZ = (long long) (force.z*0x100000000);
-    atomicAdd(&forceBuffers[atom], static_cast<unsigned long long>(fixedX));
-    atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>(fixedY));
-    atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(fixedZ));
-}
-
-/**
- * Drop the w component of a real4 and return its x, y and z components as a real3.
- */
-inline __device__ real3 trim(real4 v) {
-    real3 xyz = make_real3(v.x, v.y, v.z);
-    return xyz;
-}
-
-/**
- * Compute the difference between two vectors, taking periodic boundary conditions into account
- * and setting the fourth component to the squared magnitude.
- *
- * The x, y, z components of the result are vec1-vec2.  APPLY_PERIODIC_TO_DELTA is applied only
- * when USE_PERIODIC is defined; the squared length stored in w is computed afterwards.
- * The periodic box arguments are not referenced directly in this function; presumably the
- * APPLY_PERIODIC_TO_DELTA macro (defined elsewhere) uses them by name -- confirm before renaming.
- */
-inline __device__ real4 delta(real3 vec1, real3 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
-    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
-#ifdef USE_PERIODIC
-    APPLY_PERIODIC_TO_DELTA(result)
-#endif
-    // w is filled in last so that it reflects the (possibly wrapped) displacement.
-    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
-    return result;
-}
-
-/**
- * Return the angle between vec1 and vec2.  Each input must carry its squared length in w.
- *
- * The angle normally comes from acos() of the normalized dot product.  When the vectors are
- * nearly parallel or antiparallel (|cosine| > 0.99) acos() is ill conditioned, so the angle is
- * instead recovered from the magnitude of the cross product with asin().
- */
-__device__ real computeAngle(real4 vec1, real4 vec2) {
-    real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
-    real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
-    bool nearSingularity = (cosine > 0.99f || cosine < -0.99f);
-    if (!nearSingularity)
-        return ACOS(cosine);
-
-    // Close to the singularity in acos(): use |vec1 x vec2| / (|vec1| |vec2|) = sin(angle).
-
-    real3 normal = cross(vec1, vec2);
-    real lengthProductSquared = vec1.w*vec2.w;
-    real angle = ASIN(SQRT(dot(normal, normal)/lengthProductSquared));
-    if (cosine < 0.0f) {
-        // asin() only covers the first quadrant here; reflect for obtuse angles.
-        angle = M_PI-angle;
-    }
-    return angle;
-}
-
-/**
- * Return vec1 x vec2 as a real4 whose w component holds the squared length of the product.
- */
-inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
-    real3 product = cross(vec1, vec2);
-    real squaredLength = product.x*product.x+product.y*product.y+product.z*product.z;
-    return make_real4(product.x, product.y, product.z, squaredLength);
-}
-
-/**
- * Report whether the pair (atom1, atom2) appears in the exclusion list.
- *
- * The pair is looked up under its smaller index: entries exclusionStartIndex[lower] up to (but
- * not including) exclusionStartIndex[lower+1] of "exclusions" are scanned from the end toward
- * the start.  The scan stops early once an entry that is not greater than the smaller index is
- * found.
- */
-inline __device__ bool isInteractionExcluded(int atom1, int atom2, const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex) {
-    const int lower = min(atom1, atom2);
-    const int upper = max(atom1, atom2);
-    const int first = exclusionStartIndex[lower];
-    int i = exclusionStartIndex[lower+1];
-    while (--i >= first) {
-        const int excluded = exclusions[i];
-        if (excluded == upper)
-            return true;
-        if (excluded <= lower)
-            return false;
-    }
-    return false;
-}
-
-// Constant-memory array of NUM_GLOBALS floats.  It is not referenced by name in the code visible
-// in this file; presumably the generated interaction code reads global parameters from it -- confirm.
-__constant__ float globals[NUM_GLOBALS];
-
-/**
- * Compute the interaction.
- *
- * Each thread block takes first particles p1 = blockIdx.x, blockIdx.x+gridDim.x, ... and its
- * threads split the NUM_CANDIDATE_COMBINATIONS candidate combinations for that particle among
- * themselves (index = threadIdx.x, threadIdx.x+blockDim.x, ...).  A candidate is evaluated only
- * if it passes every enabled check: IS_VALID_COMBINATION, then the cutoff (USE_CUTOFF), the
- * type filter (USE_FILTERS) and the exclusions (USE_EXCLUSIONS).
- *
- * The per-thread energy is added to energyBuffer at the thread's global index.
- *
- * All upper case names (FIND_ATOMS_FOR_COMBINATION_INDEX, IS_VALID_COMBINATION, VERIFY_CUTOFF,
- * COMPUTE_TYPE_INDEX, VERIFY_EXCLUSIONS, PERMUTE_ATOMS, LOAD_PARTICLE_DATA, COMPUTE_INTERACTION,
- * NUM_CANDIDATE_COMBINATIONS, PARAMETER_ARGUMENTS) are supplied at kernel build time and are not
- * visible here; presumably they refer to the locals declared below (p1, a1, firstNeighbor,
- * numNeighbors, index, includeInteraction, energy, ...) by name, so those names must be kept.
- */
-extern "C" __global__ void computeInteraction(
-        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
-        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
-#ifdef USE_CUTOFF
-        , const int* __restrict__ neighbors, const int* __restrict__ neighborStartIndex
-#endif
-#ifdef USE_FILTERS
-        , int* __restrict__ particleTypes, int* __restrict__ orderIndex, int* __restrict__ particleOrder
-#endif
-#ifdef USE_EXCLUSIONS
-        , int* __restrict__ exclusions, int* __restrict__ exclusionStartIndex
-#endif
-        PARAMETER_ARGUMENTS) {
-    mixed energy = 0;
-    
-    // Loop over particles to be the first one in the set.
-    
-    for (int p1 = blockIdx.x; p1 < NUM_ATOMS; p1 += gridDim.x) {
-#ifdef USE_CENTRAL_PARTICLE
-        const int a1 = p1;
-#else
-        const int a1 = 0;
-#endif
-        // Number of candidate partners for p1: its neighbor list entries when a cutoff is used,
-        // otherwise every atom (central particle mode) or only the atoms after p1.
-#ifdef USE_CUTOFF
-        int firstNeighbor = neighborStartIndex[p1];
-        int numNeighbors = neighborStartIndex[p1+1]-firstNeighbor;
-#else
-  #ifdef USE_CENTRAL_PARTICLE
-        int numNeighbors = NUM_ATOMS;
-  #else
-        int numNeighbors = NUM_ATOMS-p1-1;
-  #endif
-#endif
-        int numCombinations = NUM_CANDIDATE_COMBINATIONS;
-        for (int index = threadIdx.x; index < numCombinations; index += blockDim.x) {
-            FIND_ATOMS_FOR_COMBINATION_INDEX;
-            bool includeInteraction = IS_VALID_COMBINATION;
-#ifdef USE_CUTOFF
-            if (includeInteraction) {
-                VERIFY_CUTOFF;
-            }
-#endif
-#ifdef USE_FILTERS
-            // An order of -1 marks a combination of particle types that is filtered out.
-            int order = orderIndex[COMPUTE_TYPE_INDEX];
-            if (order == -1)
-                includeInteraction = false;
-#endif
-#ifdef USE_EXCLUSIONS
-            if (includeInteraction) {
-                VERIFY_EXCLUSIONS;
-            }
-#endif
-            if (includeInteraction) {
-                PERMUTE_ATOMS;
-                LOAD_PARTICLE_DATA;
-                COMPUTE_INTERACTION;
-            }
-        }
-    }
-    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
-}
-
-/**
- * Find a bounding box for the atoms in each block.
- *
- * A block is a run of TILE_SIZE consecutive atoms (the last one may be shorter).  Each thread
- * handles whole blocks: it starts at the block whose number equals its global thread index and
- * advances by the total number of launched threads.
- *
- * Outputs, one entry per block:
- *   blockCenter       midpoint of the per-axis minimum and maximum positions
- *   blockBoundingBox  half of the per-axis extent (0.5*(max-min))
- *
- * Thread 0 of thread block 0 also resets *numNeighborPairs to 0.
- *
- * APPLY_PERIODIC_TO_POS and APPLY_PERIODIC_TO_POS_WITH_CENTER are defined elsewhere; presumably
- * they use the periodic box arguments by name -- confirm before renaming.
- */
-extern "C" __global__ void findBlockBounds(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ numNeighborPairs) {
-    int index = blockIdx.x*blockDim.x+threadIdx.x;
-    int base = index*TILE_SIZE;
-    while (base < NUM_ATOMS) {
-        // Start the bounds from the first atom of the block.
-        real4 pos = posq[base];
-#ifdef USE_PERIODIC
-        APPLY_PERIODIC_TO_POS(pos)
-#endif
-        real4 minPos = pos;
-        real4 maxPos = pos;
-        int last = min(base+TILE_SIZE, NUM_ATOMS);
-        for (int i = base+1; i < last; i++) {
-            pos = posq[i];
-#ifdef USE_PERIODIC
-            // Each further atom is wrapped relative to the center of the bounds found so far.
-            real4 center = 0.5f*(maxPos+minPos);
-            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
-#endif
-            minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
-            maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
-        }
-        real4 blockSize = 0.5f*(maxPos-minPos);
-        blockBoundingBox[index] = blockSize;
-        blockCenter[index] = 0.5f*(maxPos+minPos);
-        index += blockDim.x*gridDim.x;
-        base = index*TILE_SIZE;
-    }
-    if (blockIdx.x == 0 && threadIdx.x == 0)
-        *numNeighborPairs = 0;
-}
-
-/**
- * Find a list of neighbors for each atom.
- *
- * Each thread handles one atom per pass (atom1 = global thread index, advancing by the total
- * number of launched threads, up to PADDED_NUM_ATOMS).  The 32 threads of a warp first test 32
- * candidate blocks in parallel against the bounding box of atom1's block, then walk together
- * through the blocks that passed, with every thread testing its own atom against the 32 atoms
- * of the candidate block.
- *
- * Accepted pairs are appended to neighborPairs at positions reserved with an atomicAdd on
- * *numNeighborPairs.  Pairs are only written while they fit within maxNeighborPairs, but the
- * counter keeps growing so that an overflow can be detected from its final value.
- * numNeighborsForAtom[atom1] receives the number of neighbors found for each real atom.
- *
- * Launch requirements visible in the code: blockDim.x must not exceed
- * FIND_NEIGHBORS_WORKGROUP_SIZE (the size of positionCache), and the inner loops are written for
- * 32 atoms per block.  The warp-wide BALLOT and the warp synchronization below additionally
- * require that all 32 threads of a warp execute the loops together; this presumably relies on
- * PADDED_NUM_ATOMS and blockDim.x being multiples of 32 -- confirm against the host code.
- */
-extern "C" __global__ void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        const real4* __restrict__ posq, const real4* __restrict__ blockCenter, const real4* __restrict__ blockBoundingBox, int2* __restrict__ neighborPairs,
-        int* __restrict__ numNeighborPairs, int* __restrict__ numNeighborsForAtom, int maxNeighborPairs
-#ifdef USE_EXCLUSIONS
-        , const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex
-#endif
-        ) {
-    __shared__ real3 positionCache[FIND_NEIGHBORS_WORKGROUP_SIZE];
-    int indexInWarp = threadIdx.x%32;
-    for (int atom1 = blockIdx.x*blockDim.x+threadIdx.x; atom1 < PADDED_NUM_ATOMS; atom1 += blockDim.x*gridDim.x) {
-        // Load data for this atom.  Note that all threads in a warp are processing atoms from the same block.
-        
-        real3 pos1 = trim(posq[atom1]);
-        int block1 = atom1/TILE_SIZE;
-        real4 blockCenter1 = blockCenter[block1];
-        real4 blockSize1 = blockBoundingBox[block1];
-        int totalNeighborsForAtom1 = 0;
-        
-        // Loop over atom blocks to search for neighbors.  The threads in a warp compare block1 against 32
-        // other blocks in parallel.
-
-#ifdef USE_CENTRAL_PARTICLE
-        int startBlock = 0;
-#else
-        int startBlock = block1;
-#endif
-        for (int block2Base = startBlock; block2Base < NUM_BLOCKS; block2Base += 32) {
-            int block2 = block2Base+indexInWarp;
-            bool includeBlock2 = (block2 < NUM_BLOCKS);
-            if (includeBlock2) {
-                // Distance between the two bounding boxes: center separation minus both
-                // half-extents, clamped at zero on each axis.
-
-                real4 blockCenter2 = blockCenter[block2];
-                real4 blockSize2 = blockBoundingBox[block2];
-                real4 blockDelta = blockCenter1-blockCenter2;
-#ifdef USE_PERIODIC
-                APPLY_PERIODIC_TO_DELTA(blockDelta)
-#endif
-                blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSize1.x-blockSize2.x);
-                blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSize1.y-blockSize2.y);
-                blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSize1.z-blockSize2.z);
-                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < CUTOFF_SQUARED);
-            }
-            
-            // Loop over any blocks we identified as potentially containing neighbors.
-            
-            int includeBlockFlags = BALLOT(includeBlock2);
-            while (includeBlockFlags != 0) {
-                // Take the lowest set bit and clear it.
-
-                int i = __ffs(includeBlockFlags)-1;
-                includeBlockFlags &= includeBlockFlags-1;
-                int block2 = block2Base+i;
-
-                // Loop over atoms in this block.
-
-                int start = block2*TILE_SIZE;
-                int included[TILE_SIZE];
-                int numIncluded = 0;
-                positionCache[threadIdx.x] = trim(posq[start+indexInWarp]);
-
-                // Every thread below reads the entries written by the other threads of its warp.
-                // From Volta on the threads of a warp are no longer guaranteed to run in lockstep,
-                // so the writes must be made visible with an explicit warp level barrier.
-
-#if __CUDA_ARCH__ >= 700
-                __syncwarp();
-#endif
-                if (atom1 < NUM_ATOMS) {
-                    for (int j = 0; j < 32; j++) {
-                        int atom2 = start+j;
-                        real3 pos2 = positionCache[threadIdx.x-indexInWarp+j];
-
-                        // Decide whether to include this atom pair in the neighbor list.
-
-                        real4 atomDelta = delta(pos1, pos2, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
-#ifdef USE_CENTRAL_PARTICLE
-                        bool includeAtom = (atom2 != atom1 && atom2 < NUM_ATOMS && atomDelta.w < CUTOFF_SQUARED);
-#else
-                        bool includeAtom = (atom2 > atom1 && atom2 < NUM_ATOMS && atomDelta.w < CUTOFF_SQUARED);
-#endif
-#ifdef USE_EXCLUSIONS
-                        if (includeAtom)
-                            includeAtom &= !isInteractionExcluded(atom1, atom2, exclusions, exclusionStartIndex);
-#endif
-                        if (includeAtom)
-                            included[numIncluded++] = atom2;
-                    }
-                }
-
-                // Make sure every thread of the warp has finished reading the cache before any
-                // of them overwrites its entry for the next block.
-
-#if __CUDA_ARCH__ >= 700
-                __syncwarp();
-#endif
-
-                // If we found any neighbors, store them to the neighbor list.
-
-                if (numIncluded > 0) {
-                    int baseIndex = atomicAdd(numNeighborPairs, numIncluded);
-                    if (baseIndex+numIncluded <= maxNeighborPairs)
-                        for (int j = 0; j < numIncluded; j++)
-                            neighborPairs[baseIndex+j] = make_int2(atom1, included[j]);
-                    totalNeighborsForAtom1 += numIncluded;
-                }
-            }
-        }
-        if (atom1 < NUM_ATOMS)
-            numNeighborsForAtom[atom1] = totalNeighborsForAtom1;
-    }
-}
-
-/**
- * Turn the per-atom neighbor counts into start offsets.  On return neighborStartIndex[i] is the
- * total number of neighbors of all atoms before i, and neighborStartIndex[NUM_ATOMS] is the
- * grand total.  The counts in numNeighborsForAtom are reset to zero along the way.
- *
- * This kernel must be launched as a single thread block, with blockDim.x unsigned ints of
- * dynamic shared memory.  If the neighbor list overflowed (*numNeighborPairs exceeds
- * maxNeighborPairs) every start index is set to 0 instead.
- */
-extern "C" __global__ void computeNeighborStartIndices(int* __restrict__ numNeighborsForAtom, int* __restrict__ neighborStartIndex,
-            int* __restrict__ numNeighborPairs, int maxNeighborPairs) {
-    extern __shared__ unsigned int posBuffer[];
-    const bool overflowed = (*numNeighborPairs > maxNeighborPairs);
-    if (overflowed) {
-        // The neighbor list did not fit in memory and will have to be rebuilt.  Report an empty
-        // list for every atom in the meantime.
-
-        for (int i = threadIdx.x; i <= NUM_ATOMS; i += blockDim.x)
-            neighborStartIndex[i] = 0;
-        return;
-    }
-    unsigned int runningTotal = 0;
-    for (unsigned int chunkStart = 0; chunkStart < NUM_ATOMS; chunkStart += blockDim.x) {
-        // Stage this chunk of counts in shared memory, padding past the last atom with zeros.
-
-        const unsigned int atom = chunkStart+threadIdx.x;
-        const bool isRealAtom = (atom < NUM_ATOMS);
-        unsigned int count = 0;
-        if (isRealAtom)
-            count = numNeighborsForAtom[atom];
-        posBuffer[threadIdx.x] = count;
-        __syncthreads();
-
-        // Inclusive prefix sum over the chunk.  Every thread reads its partner's value before
-        // anyone updates, hence the barrier between the read and the write.
-
-        for (unsigned int stride = 1; stride < blockDim.x; stride <<= 1) {
-            unsigned int partner = 0;
-            if (threadIdx.x >= stride)
-                partner = posBuffer[threadIdx.x-stride];
-            __syncthreads();
-            posBuffer[threadIdx.x] += partner;
-            __syncthreads();
-        }
-
-        // Publish the offsets for this chunk.
-
-        if (isRealAtom) {
-            neighborStartIndex[atom+1] = posBuffer[threadIdx.x]+runningTotal;
-            numNeighborsForAtom[atom] = 0; // Cleared so the next kernel can use it as a counter
-        }
-        runningTotal += posBuffer[blockDim.x-1];
-        __syncthreads();
-    }
-    if (threadIdx.x == 0)
-        neighborStartIndex[0] = 0;
-}
-
-/**
- * Scatter the unordered (atom, neighbor) pairs into the final per-atom neighbor list.
- *
- * For each pair, a slot is claimed by atomically incrementing numNeighborsForAtom
- * for the pair's first atom, and the second atom is written at that slot past the
- * first atom's entry in neighborStartIndex.  Nothing is written when
- * *numNeighborPairs exceeds maxNeighborPairs.
- */
-extern "C" __global__ void copyPairsToNeighborList(const int2* __restrict__ neighborPairs, int* __restrict__ neighbors, int* __restrict__ numNeighborPairs,
-            int maxNeighborPairs, int* __restrict__ numNeighborsForAtom, const int* __restrict__ neighborStartIndex) {
-    const int pairCount = *numNeighborPairs;
-    if (pairCount <= maxNeighborPairs) {
-        // Otherwise the pair buffer overflowed and the list has to be rebuilt.
-
-        const unsigned int stride = blockDim.x*gridDim.x;
-        unsigned int i = blockDim.x*blockIdx.x+threadIdx.x;
-        while (i < pairCount) {
-            const int2 entry = neighborPairs[i];
-            const int base = neighborStartIndex[entry.x];
-            const int slot = atomicAdd(numNeighborsForAtom+entry.x, 1);
-            neighbors[base+slot] = entry.y;
-            i += stride;
-        }
-    }
-}
--- a/platforms/cuda/src/kernels/customNonbondedGroups.cu
+++ b/platforms/cuda/src/kernels/customNonbondedGroups.cu
-typedef struct { // Element type of the shared localData array in computeInteractionGroups
-    real x, y, z; // Position, copied from posq.x/.y/.z of the tile's second atom
-    real q; // Copied from posq.w of the tile's second atom
-    real fx, fy, fz; // Force accumulated on the second atom while the tile is processed
-    ATOM_PARAMETER_DATA // Macro defined outside this chunk; presumably expands to extra per-atom parameter fields (confirm)
-#ifndef PARAMETER_SIZE_IS_EVEN
-    real padding; // Present only when PARAMETER_SIZE_IS_EVEN is not defined
-#endif
-} AtomData;
-
-/**
- * Return to every thread the largest value of val held by any thread in its warp.
- * The reduction only runs when compiling for compute capability 7.0 (Volta) or
- * later; on earlier architectures the argument is returned unchanged.
- */
-__device__ int reduceMax(int val) {
-#if __CUDA_ARCH__ >= 700
-    // Butterfly exchange over lane distances 16, 8, 4, 2, 1.
-    int offset = 16;
-    while (offset > 0) {
-        val = max(val, __shfl_xor_sync(0xffffffff, val, offset));
-        offset /= 2;
-    }
-#endif
-    return val;
-}
-/**
- * Evaluate the interaction for every tile of the interaction group list.
- *
- * Threads are split into groups of TILE_SIZE ("warps" below).  Each warp handles a
- * contiguous range of tiles.  When useNeighborList is true the number of tiles is
- * read from numGroupTiles[0]; otherwise the range FIRST_TILE..LAST_TILE is divided
- * among the warps.
- *
- * Each thread reads one int4 from groupData for its tile:
- *   .x  index of atom1 (kept in registers by this thread)
- *   .y  index of atom2 (staged in shared memory for the whole warp)
- *   .z  rangeStart in the low 16 bits, rangeEnd in the next 16 bits
- *   .w  bit mask; bit tj clear means the pair with shared slot tj is excluded
- *
- * Forces are added to forceBuffers with atomicAdd as fixed point values scaled by
- * 0x100000000, with the y and z components offset by PADDED_NUM_ATOMS and
- * 2*PADDED_NUM_ATOMS.  The energy accumulated by each thread is added to its own
- * energyBuffer slot.
- *
- * LOAD_ATOM1_PARAMETERS, LOAD_LOCAL_PARAMETERS, LOAD_ATOM2_PARAMETERS,
- * COMPUTE_INTERACTION, INIT_DERIVATIVES, SAVE_DERIVATIVES, PARAMETER_ARGUMENTS and
- * SYNC_WARPS are defined outside this chunk.  They presumably refer to the local
- * variables declared here by name, so renaming locals is risky (confirm).
- */
-extern "C" __global__ void computeInteractionGroups(
-        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData,
-        const int* __restrict__ numGroupTiles, bool useNeighborList,
-        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
-        PARAMETER_ARGUMENTS) {
-    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
-    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
-    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
-    const unsigned int tbx = threadIdx.x - tgx;           // block warpIndex
-    mixed energy = 0;
-    INIT_DERIVATIVES
-    __shared__ AtomData localData[LOCAL_MEMORY_SIZE]; // indexed by threadIdx.x, so LOCAL_MEMORY_SIZE presumably is at least blockDim.x (confirm)
-
-    const unsigned int startTile = (useNeighborList ? warp*numGroupTiles[0]/totalWarps : FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps);
-    const unsigned int endTile = (useNeighborList ? (warp+1)*numGroupTiles[0]/totalWarps : FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps);
-    for (int tile = startTile; tile < endTile; tile++) {
-        const int4 atomData = groupData[TILE_SIZE*tile+tgx];
-        const int atom1 = atomData.x;
-        const int atom2 = atomData.y;
-        const int rangeStart = atomData.z&0xFFFF;
-        const int rangeEnd = (atomData.z>>16)&0xFFFF;
-        const int exclusions = atomData.w;
-        real4 posq1 = posq[atom1];
-        LOAD_ATOM1_PARAMETERS
-        real3 force = make_real3(0);
-        real4 posq2 = posq[atom2];
-        localData[threadIdx.x].x = posq2.x;
-        localData[threadIdx.x].y = posq2.y;
-        localData[threadIdx.x].z = posq2.z;
-        localData[threadIdx.x].q = posq2.w;
-        LOAD_LOCAL_PARAMETERS
-        localData[threadIdx.x].fx = 0.0f;
-        localData[threadIdx.x].fy = 0.0f;
-        localData[threadIdx.x].fz = 0.0f;
-        int tj = tgx;
-        int rangeStop = rangeStart + reduceMax(rangeEnd-rangeStart); // longest range in the warp, so every lane runs the same number of iterations
-        SYNC_WARPS;
-        for (int j = rangeStart; j < rangeStop; j++) {
-            if (j < rangeEnd) { // lanes with a shorter range idle but still reach the SYNC_WARPS below
-                bool isExcluded = (((exclusions>>tj)&1) == 0);
-                int localIndex = tbx+tj;
-                posq2 = make_real4(localData[localIndex].x, localData[localIndex].y, localData[localIndex].z, localData[localIndex].q);
-                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
-#ifdef USE_PERIODIC
-                APPLY_PERIODIC_TO_DELTA(delta)
-#endif
-                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                if (!isExcluded && r2 < CUTOFF_SQUARED) {
-#endif
-                    // NOTE(review): without USE_CUTOFF, isExcluded is not tested in the visible code; presumably COMPUTE_INTERACTION consumes it (confirm).
-                    real invR = RSQRT(r2);
-                    real r = r2*invR;
-                    LOAD_ATOM2_PARAMETERS
-                    real dEdR = 0.0f;
-                    real tempEnergy = 0.0f;
-                    const real interactionScale = 1.0f;
-                    COMPUTE_INTERACTION
-                    energy += tempEnergy;
-                    delta *= dEdR;
-                    force.x -= delta.x;
-                    force.y -= delta.y;
-                    force.z -= delta.z;
-                    localData[localIndex].fx += delta.x;
-                    localData[localIndex].fy += delta.y;
-                    localData[localIndex].fz += delta.z;
-#ifdef USE_CUTOFF
-                }
-#endif
-                tj = (tj == rangeEnd-1 ? rangeStart : tj+1); // step to the next shared slot, wrapping inside [rangeStart, rangeEnd)
-            }
-            SYNC_WARPS;
-        }
-        if (exclusions != 0) { // atom1's force is only written when its mask allows at least one pair
-            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
-            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
-            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
-        }
-        atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
-        atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
-        atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
-        SYNC_WARPS;
-    }
-    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
-    SAVE_DERIVATIVES
-}
-
-/**
- * Reset the tile counter to 0 when the rebuild flag is set to 1, so that a
- * following neighbor list build starts from an empty list.  This is executed by
- * a single thread.
- */
-extern "C" __global__  void prepareToBuildNeighborList(int* __restrict__ rebuildNeighborList, int* __restrict__ numGroupTiles) {
-    const bool rebuildRequested = (rebuildNeighborList[0] == 1);
-    if (rebuildRequested) {
-        numGroupTiles[0] = 0;
-    }
-}
-
-/**
- * Filter the list of tiles to include only ones that have interactions within the
- * padded cutoff.
- *
- * The kernel does nothing when rebuildNeighborList[0] is 0.  Otherwise the NUM_TILES
- * tiles in groupData are divided among the groups of TILE_SIZE threads ("warps").
- * For each tile, every lane tests its atom1 against the atom2 positions staged in
- * shared memory.  As soon as any lane finds a non-excluded pair with squared
- * distance below PADDED_CUTOFF_SQUARED, the per-warp flag is raised and the search
- * loop stops.  Lane 0 then reserves an output slot by atomically incrementing
- * numGroupTiles, and every lane copies its own groupData entry into that slot of
- * filteredGroupData.
- *
- * The groupData entries are unpacked the same way as in computeInteractionGroups:
- * .x/.y are atom indices, .z packs rangeStart (low 16 bits) and rangeEnd (next
- * 16 bits), and a clear bit in .w marks an excluded pair.
- */
-extern "C" __global__  void buildNeighborList(int* __restrict__ rebuildNeighborList, int* __restrict__ numGroupTiles,
-        const real4* __restrict__ posq, const int4* __restrict__ groupData, int4* __restrict__ filteredGroupData,
-        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
-    
-    // If the neighbor list doesn't need to be rebuilt on this step, return immediately.
-    
-    if (rebuildNeighborList[0] == 0)
-        return;
-
-    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
-    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
-    const unsigned int local_warp = threadIdx.x/TILE_SIZE; // local warpIndex
-    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
-    const unsigned int tbx = threadIdx.x - tgx;           // block warpIndex
-
-    __shared__ real4 localPos[LOCAL_MEMORY_SIZE]; // atom2 positions, one per thread
-    __shared__ volatile bool anyInteraction[WARPS_IN_BLOCK]; // per-warp "tile has an interaction" flag
-    __shared__ volatile int tileIndex[WARPS_IN_BLOCK]; // per-warp output slot, set by lane 0
-
-    const unsigned int startTile = warp*NUM_TILES/totalWarps;
-    const unsigned int endTile = (warp+1)*NUM_TILES/totalWarps;
-    for (int tile = startTile; tile < endTile; tile++) {
-        const int4 atomData = groupData[TILE_SIZE*tile+tgx];
-        const int atom1 = atomData.x;
-        const int atom2 = atomData.y;
-        const int rangeStart = atomData.z&0xFFFF;
-        const int rangeEnd = (atomData.z>>16)&0xFFFF;
-        const int exclusions = atomData.w;
-        real4 posq1 = posq[atom1];
-        localPos[threadIdx.x] = posq[atom2];
-        if (tgx == 0)
-            anyInteraction[local_warp] = false;
-        int tj = tgx;
-        int rangeStop = rangeStart + reduceMax(rangeEnd-rangeStart); // longest range in the warp
-        SYNC_WARPS;
-        for (int j = rangeStart; j < rangeStop && !anyInteraction[local_warp]; j++) { // NOTE(review): the exit test reads a shared flag that other lanes write; confirm all lanes observe it consistently on the targeted architectures
-            SYNC_WARPS;
-            if (j < rangeEnd && tj < rangeEnd) {
-                bool isExcluded = (((exclusions>>tj)&1) == 0);
-                int localIndex = tbx+tj;
-                real3 delta = make_real3(localPos[localIndex].x-posq1.x, localPos[localIndex].y-posq1.y, localPos[localIndex].z-posq1.z);
-#ifdef USE_PERIODIC
-                APPLY_PERIODIC_TO_DELTA(delta)
-#endif
-                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                if (!isExcluded && r2 < PADDED_CUTOFF_SQUARED)
-                    anyInteraction[local_warp] = true;
-            }
-            tj = (tj == rangeEnd-1 ? rangeStart : tj+1); // step to the next shared slot, wrapping inside [rangeStart, rangeEnd)
-            SYNC_WARPS;
-        }
-        if (anyInteraction[local_warp]) {
-            SYNC_WARPS;
-            if (tgx == 0)
-                tileIndex[local_warp] = atomicAdd(numGroupTiles, 1); // reserve the next free output tile
-            SYNC_WARPS;
-            filteredGroupData[TILE_SIZE*tileIndex[local_warp]+tgx] = atomData;
-        }
-    }
-}
--- a/platforms/cuda/src/kernels/gbsaObc1.cu
+++ b/platforms/cuda/src/kernels/gbsaObc1.cu
-#define DIELECTRIC_OFFSET 0.009f // Added back to an atom's offset radius to recover the non-offset radius; units presumably nm (confirm)
-#define PROBE_RADIUS 0.14f // Added to the non-offset radius in the surface area term of reduceBornForce; units presumably nm (confirm)
-#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE) // Number of TILE_SIZE-thread groups in a block of FORCE_WORK_GROUP_SIZE threads
-
-/**
- * Convert the accumulated Born sums into Born radii and OBC chain rule terms.
- *
- * bornSum holds one fixed point value per atom, scaled by 0x100000000.  alpha,
- * beta and gamma are the coefficients of the cubic polynomial passed to tanh.
- * params[i].x is read as the offset radius of atom i.  The results are written
- * to bornRadii and obcChain for every atom below NUM_ATOMS.
- */
-
-extern "C" __global__ void reduceBornSum(float alpha, float beta, float gamma, const long long* __restrict__ bornSum,
-            const float2* __restrict__ params, real* __restrict__ bornRadii, real* __restrict__ obcChain) {
-    const unsigned int stride = blockDim.x*gridDim.x;
-    for (unsigned int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += stride) {
-        // Convert the fixed point sum back to a real value.
-
-        real scaled = RECIP(0x100000000)*bornSum[atom];
-
-        // Scale it by half the offset radius, then evaluate the tanh term.
-
-        float offsetRadius = params[atom].x;
-        scaled *= 0.5f*offsetRadius;
-        real scaled2 = scaled*scaled;
-        real scaled3 = scaled*scaled2;
-        real tanhTerm = tanh(alpha*scaled - beta*scaled2 + gamma*scaled3);
-        real fullRadius = offsetRadius + DIELECTRIC_OFFSET;
-
-        // Born radius, followed by the chain rule term.
-
-        bornRadii[atom] = RECIP(RECIP(offsetRadius) - tanhTerm/fullRadius);
-        real chainTerm = offsetRadius*(alpha - 2.0f*beta*scaled + 3.0f*gamma*scaled2);
-        chainTerm = (1-tanhTerm*tanhTerm)*chainTerm / fullRadius;
-        obcChain[atom] = chainTerm;
-    }
-}
-
-/**
- * Finish the Born force for each atom.
- *
- * Each fixed point entry of bornForce (scaled by 0x100000000) is converted to a
- * real value, the surface area contribution is added, the result is multiplied
- * by bornRadius^2 and the atom's obcChain term, and it is stored back in fixed
- * point form.  The summed surface area terms, divided by -6, are added to this
- * thread's slot of energyBuffer.
- */
-
-extern "C" __global__ void reduceBornForce(long long* __restrict__ bornForce, mixed* __restrict__ energyBuffer,
-        const float2* __restrict__ params, const real* __restrict__ bornRadii, const real* __restrict__ obcChain) {
-    mixed surfaceEnergy = 0;
-    const unsigned int stride = blockDim.x*gridDim.x;
-    for (unsigned int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += stride) {
-        // Convert the fixed point force back to a real value.
-
-        real f = RECIP(0x100000000)*bornForce[atom];
-
-        // Add the surface area term and apply the chain rule factor.
-
-        float offsetRadius = params[atom].x;
-        real radius = bornRadii[atom];
-        real shell = offsetRadius+DIELECTRIC_OFFSET+PROBE_RADIUS;
-        real ratio6 = POW((offsetRadius+DIELECTRIC_OFFSET)/radius, 6);
-        real areaTerm = SURFACE_AREA_FACTOR*shell*shell*ratio6;
-        f += areaTerm/radius;
-        surfaceEnergy += areaTerm;
-        f *= radius*radius*obcChain[atom];
-        bornForce[atom] = (long long) (f*0x100000000);
-    }
-    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += surfaceEnergy/-6;
-}
-
-// Shared memory record used by computeBornSum for the second atom of each pair.
-typedef struct {
-    real x;             // position
-    real y;
-    real z;
-    real q;             // charge
-    float radius;       // copied from global_params .x
-    float scaledRadius; // copied from global_params .y
-    real bornSum;       // contribution accumulated for this atom within a tile
-} AtomData1;
-
-/**
- * Compute the Born sum.
- *
- * Threads are split into groups of TILE_SIZE ("warps" below) and each warp processes
- * whole tiles of TILE_SIZE x TILE_SIZE atom pairs.  Lane tgx owns atom
- * x*TILE_SIZE+tgx of the tile and keeps its partial sum in a register.  The data
- * for the other TILE_SIZE atoms is staged in shared memory, and for off-diagonal
- * tiles their partial sums are accumulated there too.
- *
- * The work is done in two passes:
- *   1. Tiles listed in exclusionTiles between FIRST_EXCLUSION_TILE and
- *      LAST_EXCLUSION_TILE, divided evenly among the warps.
- *   2. All remaining tiles.  With USE_CUTOFF they come from the neighbor list
- *      (tiles / interactingAtoms, interactionCount[0] entries), and the pass is
- *      skipped entirely if that count exceeds maxTiles.  Without USE_CUTOFF the
- *      numTiles positions of the triangular tile matrix are enumerated and the
- *      tiles already handled in pass 1 are skipped.
- *
- * Results are added to global_bornSum with atomicAdd as fixed point values scaled
- * by 0x100000000.  global_params[i].x is used as the atom's offset radius and .y
- * as its scaled radius.
- *
- * The shared arrays are sized FORCE_WORK_GROUP_SIZE and indexed by threadIdx.x, so
- * the block size presumably must not exceed FORCE_WORK_GROUP_SIZE (confirm).
- */
-extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ global_bornSum, const real4* __restrict__ posq, const real* __restrict__ charge, const float2* __restrict__ global_params,
-#ifdef USE_CUTOFF
-        const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
-        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
-        const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms,
-#else
-        unsigned int numTiles,
-#endif
-        const ushort2* __restrict__ exclusionTiles) {
-    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
-    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
-    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
-    const unsigned int tbx = threadIdx.x - tgx; // index of the warp's first thread within the block
-    __shared__ AtomData1 localData[FORCE_WORK_GROUP_SIZE];
-
-    // First loop: process tiles that contain exclusions.
-    
-    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
-    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
-    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-        const ushort2 tileIndices = exclusionTiles[pos];
-        const unsigned int x = tileIndices.x;
-        const unsigned int y = tileIndices.y;
-        real bornSum = 0;
-        unsigned int atom1 = x*TILE_SIZE + tgx;
-        real4 posq1 = posq[atom1];
-        real charge1 = charge[atom1];
-        float2 params1 = global_params[atom1];
-        if (x == y) {
-            // This tile is on the diagonal.
-
-            localData[threadIdx.x].x = posq1.x;
-            localData[threadIdx.x].y = posq1.y;
-            localData[threadIdx.x].z = posq1.z;
-            localData[threadIdx.x].q = charge1;
-            localData[threadIdx.x].radius = params1.x;
-            localData[threadIdx.x].scaledRadius = params1.y;
-            for (unsigned int j = 0; j < TILE_SIZE; j++) { // NOTE(review): reads other lanes' localData with no warp barrier after the stores above; confirm this is safe on the targeted architectures
-                real3 delta = make_real3(localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z);
-#ifdef USE_PERIODIC
-                APPLY_PERIODIC_TO_DELTA(delta)
-#endif
-                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
-#else
-                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
-#endif
-                    real invR = RSQRT(r2);
-                    real r = r2*invR;
-                    float2 params2 = make_float2(localData[tbx+j].radius, localData[tbx+j].scaledRadius);
-                    real rScaledRadiusJ = r+params2.y;
-                    if ((j != tgx) && (params1.x < rScaledRadiusJ)) { // j == tgx is the atom paired with itself
-                        real l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
-                        real u_ij = RECIP(rScaledRadiusJ);
-                        real l_ij2 = l_ij*l_ij;
-                        real u_ij2 = u_ij*u_ij;
-                        real ratio = LOG(u_ij * RECIP(l_ij));
-                        bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
-                                            (params2.y*params2.y*invR)*(l_ij2-u_ij2));
-                        bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
-                    }
-                }
-            }
-        }
-        else {
-            // This is an off-diagonal tile.
-
-            unsigned int j = y*TILE_SIZE + tgx;
-            real4 tempPosq = posq[j];
-            localData[threadIdx.x].x = tempPosq.x;
-            localData[threadIdx.x].y = tempPosq.y;
-            localData[threadIdx.x].z = tempPosq.z;
-            localData[threadIdx.x].q = charge[j];
-            float2 tempParams = global_params[j];
-            localData[threadIdx.x].radius = tempParams.x;
-            localData[threadIdx.x].scaledRadius = tempParams.y;
-            localData[threadIdx.x].bornSum = 0.0f;
-
-            // Compute the full set of interactions in this tile.
-
-            unsigned int tj = tgx; // each lane starts on a different shared slot and rotates through all of them
-            for (j = 0; j < TILE_SIZE; j++) {
-                real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
-#ifdef USE_PERIODIC
-                APPLY_PERIODIC_TO_DELTA(delta)
-#endif
-                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
-#else
-                if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
-#endif
-                    real invR = RSQRT(r2);
-                    real r = r2*invR;
-                    float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
-                    real rScaledRadiusJ = r+params2.y;
-                    if (params1.x < rScaledRadiusJ) { // contribution of atom2 to atom1's sum
-                        real l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
-                        real u_ij = RECIP(rScaledRadiusJ);
-                        real l_ij2 = l_ij*l_ij;
-                        real u_ij2 = u_ij*u_ij;
-                        real ratio = LOG(u_ij * RECIP(l_ij));
-                        bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
-                                            (params2.y*params2.y*invR)*(l_ij2-u_ij2));
-                        bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
-                    }
-                    real rScaledRadiusI = r+params1.y;
-                    if (params2.x < rScaledRadiusI) { // contribution of atom1 to atom2's sum
-                        real l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
-                        real u_ij = RECIP(rScaledRadiusI);
-                        real l_ij2 = l_ij*l_ij;
-                        real u_ij2 = u_ij*u_ij;
-                        real ratio = LOG(u_ij * RECIP(l_ij));
-                        real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
-                                            (params1.y*params1.y*invR)*(l_ij2-u_ij2));
-                        term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
-                        localData[tbx+tj].bornSum += term;
-                    }
-                }
-                tj = (tj + 1) & (TILE_SIZE - 1);
-            }
-        }
-        
-        // Write results.
-        
-        unsigned int offset = x*TILE_SIZE + tgx;
-        atomicAdd(&global_bornSum[offset], static_cast<unsigned long long>((long long) (bornSum*0x100000000)));
-        if (x != y) {
-            offset = y*TILE_SIZE + tgx;
-            atomicAdd(&global_bornSum[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].bornSum*0x100000000)));
-        }
-    }
-
-    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
-    // of them (no cutoff).
-
-#ifdef USE_CUTOFF
-    unsigned int numTiles = interactionCount[0];
-    if (numTiles > maxTiles)
-        return; // There wasn't enough memory for the neighbor list.
-    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); // NOTE(review): after the early return above, numTiles > maxTiles is always false here
-    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); // NOTE(review): the (long) cast is narrower than (long long) on platforms where long is 32 bits; confirm this is intended
-#else
-    int pos = (int) (warp*(long long)numTiles/totalWarps);
-    int end = (int) ((warp+1)*(long long)numTiles/totalWarps);
-#endif
-    int skipBase = 0;
-    int currentSkipIndex = tbx;
-    __shared__ int atomIndices[FORCE_WORK_GROUP_SIZE]; // global index of the atom staged in each shared slot
-    __shared__ volatile int skipTiles[FORCE_WORK_GROUP_SIZE]; // per-warp window of linearized exclusion tile positions
-    skipTiles[threadIdx.x] = -1;
-
-    while (pos < end) {
-        real bornSum = 0;
-        bool includeTile = true;
-
-        // Extract the coordinates of this tile.
-        
-        int x, y;
-        bool singlePeriodicCopy = false;
-#ifdef USE_CUTOFF
-            x = tiles[pos];
-            real4 blockSizeX = blockSize[x];
-            singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
-                                  0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
-                                  0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-#else
-        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); // recover the (x, y) tile coordinates from the linear position
-        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-            y += (x < y ? -1 : 1);
-            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-        }
-
-        // Skip over tiles that have exclusions, since they were already processed.
-
-        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
-            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
-                ushort2 tile = exclusionTiles[skipBase+tgx];
-                skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
-            }
-            else
-                skipTiles[threadIdx.x] = end;
-            skipBase += TILE_SIZE;            
-            currentSkipIndex = tbx;
-        }
-        while (skipTiles[currentSkipIndex] < pos)
-            currentSkipIndex++;
-        includeTile = (skipTiles[currentSkipIndex] != pos);
-#endif
-        if (includeTile) {
-            unsigned int atom1 = x*TILE_SIZE + tgx;
-
-            // Load atom data for this tile.
-
-            real4 posq1 = posq[atom1];
-            real charge1 = charge[atom1];
-            float2 params1 = global_params[atom1];
-#ifdef USE_CUTOFF
-            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
-#else
-            unsigned int j = y*TILE_SIZE + tgx;
-#endif
-            atomIndices[threadIdx.x] = j;
-            if (j < PADDED_NUM_ATOMS) { // out-of-range indices are not loaded; presumably they mark unused neighbor list slots (confirm)
-                real4 tempPosq = posq[j];
-                localData[threadIdx.x].x = tempPosq.x;
-                localData[threadIdx.x].y = tempPosq.y;
-                localData[threadIdx.x].z = tempPosq.z;
-                localData[threadIdx.x].q = charge[j];
-                float2 tempParams = global_params[j];
-                localData[threadIdx.x].radius = tempParams.x;
-                localData[threadIdx.x].scaledRadius = tempParams.y;
-                localData[threadIdx.x].bornSum = 0.0f;
-            }
-#ifdef USE_PERIODIC
-            if (singlePeriodicCopy) {
-                // The box is small enough that we can just translate all the atoms into a single periodic
-                // box, then skip having to apply periodic boundary conditions later.
-
-                real4 blockCenterX = blockCenter[x];
-                APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
-                APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x], blockCenterX)
-                unsigned int tj = tgx;
-                for (j = 0; j < TILE_SIZE; j++) {
-                    real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
-                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                    int atom2 = atomIndices[tbx+tj];
-                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
-                        real invR = RSQRT(r2);
-                        real r = r2*invR;
-                        float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
-                        real rScaledRadiusJ = r+params2.y;
-                        if (params1.x < rScaledRadiusJ) {
-                            real l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
-                            real u_ij = RECIP(rScaledRadiusJ);
-                            real l_ij2 = l_ij*l_ij;
-                            real u_ij2 = u_ij*u_ij;
-                            real ratio = LOG(u_ij * RECIP(l_ij));
-                            bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
-                                                (params2.y*params2.y*invR)*(l_ij2-u_ij2));
-                            bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
-                        }
-                        real rScaledRadiusI = r+params1.y;
-                        if (params2.x < rScaledRadiusI) {
-                            real l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
-                            real u_ij = RECIP(rScaledRadiusI);
-                            real l_ij2 = l_ij*l_ij;
-                            real u_ij2 = u_ij*u_ij;
-                            real ratio = LOG(u_ij * RECIP(l_ij));
-                            real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
-                                                (params1.y*params1.y*invR)*(l_ij2-u_ij2));
-                            term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
-                            localData[tbx+tj].bornSum += term;
-                        }
-                    }
-                    tj = (tj + 1) & (TILE_SIZE - 1);
-                }
-            }
-            else
-#endif
-            {
-                // We need to apply periodic boundary conditions separately for each interaction.
-
-                unsigned int tj = tgx;
-                for (j = 0; j < TILE_SIZE; j++) {
-                    real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
-#ifdef USE_PERIODIC
-                    APPLY_PERIODIC_TO_DELTA(delta)
-#endif
-                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                    int atom2 = atomIndices[tbx+tj];
-#ifdef USE_CUTOFF
-                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
-#else
-                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-#endif
-                        real invR = RSQRT(r2);
-                        real r = r2*invR;
-                        float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
-                        real rScaledRadiusJ = r+params2.y;
-                        if (params1.x < rScaledRadiusJ) {
-                            real l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
-                            real u_ij = RECIP(rScaledRadiusJ);
-                            real l_ij2 = l_ij*l_ij;
-                            real u_ij2 = u_ij*u_ij;
-                            real ratio = LOG(u_ij * RECIP(l_ij));
-                            bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
-                                                (params2.y*params2.y*invR)*(l_ij2-u_ij2));
-                            bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
-                        }
-                        real rScaledRadiusI = r+params1.y;
-                        if (params2.x < rScaledRadiusI) {
-                            real l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
-                            real u_ij = RECIP(rScaledRadiusI);
-                            real l_ij2 = l_ij*l_ij;
-                            real u_ij2 = u_ij*u_ij;
-                            real ratio = LOG(u_ij * RECIP(l_ij));
-                            real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
-                                                (params1.y*params1.y*invR)*(l_ij2-u_ij2));
-                            term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
-                            localData[tbx+tj].bornSum += term;
-                        }
-                    }
-                    tj = (tj + 1) & (TILE_SIZE - 1);
-                }
-            }
-        
-            // Write results.
-
-            atomicAdd(&global_bornSum[atom1], static_cast<unsigned long long>((long long) (bornSum*0x100000000)));
-#ifdef USE_CUTOFF
-            unsigned int atom2 = atomIndices[threadIdx.x];
-#else
-            unsigned int atom2 = y*TILE_SIZE + tgx;
-#endif
-            if (atom2 < PADDED_NUM_ATOMS)
-                atomicAdd(&global_bornSum[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].bornSum*0x100000000)));
-        }
-        pos++;
-    }
-}
-
-// Shared memory record used by computeGBSAForce1 for the second atom of each pair.
-typedef struct {
-    real x;          // position
-    real y;
-    real z;
-    real q;          // charge
-    real fx;         // accumulators; presumably force components plus a fourth (w) term
-    real fy;
-    real fz;
-    real fw;
-    real bornRadius; // Born radius of this atom
-} AtomData2;
-
-/**
- * First part of computing the GBSA interaction.
- */
-
-extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ global_bornForce,
-        mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const real* __restrict__ charge, const real* __restrict__ global_bornRadii, bool needEnergy,
-#ifdef USE_CUTOFF
-        const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
-        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
-        const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms,
-#else
-        unsigned int numTiles,
-#endif
-        const ushort2* __restrict__ exclusionTiles) {
-    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
-    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
-    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
-    const unsigned int tbx = threadIdx.x - tgx;
-    mixed energy = 0;
-    __shared__ AtomData2 localData[FORCE_WORK_GROUP_SIZE];
-
-    // First loop: process tiles that contain exclusions.
-    
-    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
-    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
-    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-        const ushort2 tileIndices = exclusionTiles[pos];
-        const unsigned int x = tileIndices.x;
-        const unsigned int y = tileIndices.y;
-        real4 force = make_real4(0);
-        unsigned int atom1 = x*TILE_SIZE + tgx;
-        real4 posq1 = posq[atom1];
-        real charge1 = charge[atom1];
-        real bornRadius1 = global_bornRadii[atom1];
-        if (x == y) {
-            // This tile is on the diagonal.
-
-            localData[threadIdx.x].x = posq1.x;
-            localData[threadIdx.x].y = posq1.y;
-            localData[threadIdx.x].z = posq1.z;
-            localData[threadIdx.x].q = charge1;
-            localData[threadIdx.x].bornRadius = bornRadius1;
-            for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
-                    real3 pos2 = make_real3(localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z);
-                    real charge2 = localData[tbx+j].q;
-                    real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
-#ifdef USE_PERIODIC
-                    APPLY_PERIODIC_TO_DELTA(delta)
-#endif
-                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                    if (r2 < CUTOFF_SQUARED) {
-#endif
-                        real invR = RSQRT(r2);
-                        real r = r2*invR;
-                        real bornRadius2 = localData[tbx+j].bornRadius;
-                        real alpha2_ij = bornRadius1*bornRadius2;
-                        real D_ij = r2*RECIP(4.0f*alpha2_ij);
-                        real expTerm = EXP(-D_ij);
-                        real denominator2 = r2 + alpha2_ij*expTerm;
-                        real denominator = SQRT(denominator2);
-                        real scaledChargeProduct = PREFACTOR*charge1*charge2;
-                        real tempEnergy = scaledChargeProduct*RECIP(denominator);
-                        real Gpol = tempEnergy*RECIP(denominator2);
-                        real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
-                        real dEdR = Gpol*(1.0f - 0.25f*expTerm);
-                        force.w += dGpol_dalpha2_ij*bornRadius2;
-#ifdef USE_CUTOFF
-                        if (atom1 != y*TILE_SIZE+j)
-                            tempEnergy -= scaledChargeProduct/CUTOFF;
-#endif
-                        if (needEnergy)
-                            energy += 0.5f*tempEnergy;
-                        delta *= dEdR;
-                        force.x -= delta.x;
-                        force.y -= delta.y;
-                        force.z -= delta.z;
-#ifdef USE_CUTOFF
-                    }
-#endif
-                }
-            }
-        }
-        else {
-            // This is an off-diagonal tile.
-
-            unsigned int j = y*TILE_SIZE + tgx;
-            real4 tempPosq = posq[j];
-            localData[threadIdx.x].x = tempPosq.x;
-            localData[threadIdx.x].y = tempPosq.y;
-            localData[threadIdx.x].z = tempPosq.z;
-            localData[threadIdx.x].q = charge[j];
-            localData[threadIdx.x].bornRadius = global_bornRadii[j];
-            localData[threadIdx.x].fx = 0.0f;
-            localData[threadIdx.x].fy = 0.0f;
-            localData[threadIdx.x].fz = 0.0f;
-            localData[threadIdx.x].fw = 0.0f;
-            unsigned int tj = tgx;
-            for (j = 0; j < TILE_SIZE; j++) {
-                if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
-                    real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
-                    real charge2 = localData[tbx+tj].q;
-                    real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
-#ifdef USE_PERIODIC
-                    APPLY_PERIODIC_TO_DELTA(delta)
-#endif
-                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                    if (r2 < CUTOFF_SQUARED) {
-#endif
-                        real invR = RSQRT(r2);
-                        real r = r2*invR;
-                        real bornRadius2 = localData[tbx+tj].bornRadius;
-                        real alpha2_ij = bornRadius1*bornRadius2;
-                        real D_ij = r2*RECIP(4.0f*alpha2_ij);
-                        real expTerm = EXP(-D_ij);
-                        real denominator2 = r2 + alpha2_ij*expTerm;
-                        real denominator = SQRT(denominator2);
-                        real scaledChargeProduct = PREFACTOR*charge1*charge2;
-                        real tempEnergy = scaledChargeProduct*RECIP(denominator);
-                        real Gpol = tempEnergy*RECIP(denominator2);
-                        real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
-                        real dEdR = Gpol*(1.0f - 0.25f*expTerm);
-                        force.w += dGpol_dalpha2_ij*bornRadius2;
-#ifdef USE_CUTOFF
-                        tempEnergy -= scaledChargeProduct/CUTOFF;
-#endif
-                        if (needEnergy)
-                            energy += tempEnergy;
-                        delta *= dEdR;
-                        force.x -= delta.x;
-                        force.y -= delta.y;
-                        force.z -= delta.z;
-                        localData[tbx+tj].fx += delta.x;
-                        localData[tbx+tj].fy += delta.y;
-                        localData[tbx+tj].fz += delta.z;
-                        localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
-#ifdef USE_CUTOFF
-                    }
-#endif
-                }
-                tj = (tj + 1) & (TILE_SIZE - 1);
-            }
-        }
-        
-        // Write results.
-        
-        unsigned int offset = x*TILE_SIZE + tgx;
-        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
-        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
-        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
-        atomicAdd(&global_bornForce[offset], static_cast<unsigned long long>((long long) (force.w*0x100000000)));
-        if (x != y) {
-            offset = y*TILE_SIZE + tgx;
-            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
-            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
-            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
-            atomicAdd(&global_bornForce[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fw*0x100000000)));
-        }
-    }
-
-    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
-    // of them (no cutoff).
-
-#ifdef USE_CUTOFF
-    unsigned int numTiles = interactionCount[0];
-    if (numTiles > maxTiles)
-        return; // There wasn't enough memory for the neighbor list.
-    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
-    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
-#else
-    int pos = (int) (warp*(long long)numTiles/totalWarps);
-    int end = (int) ((warp+1)*(long long)numTiles/totalWarps);
-#endif
-    int skipBase = 0;
-    int currentSkipIndex = tbx;
-    __shared__ int atomIndices[FORCE_WORK_GROUP_SIZE];
-    __shared__ volatile int skipTiles[FORCE_WORK_GROUP_SIZE];
-    skipTiles[threadIdx.x] = -1;
-
-    while (pos < end) {
-        real4 force = make_real4(0);
-        bool includeTile = true;
-
-        // Extract the coordinates of this tile.
-        
-        int x, y;
-        bool singlePeriodicCopy = false;
-#ifdef USE_CUTOFF
-        x = tiles[pos];
-        real4 blockSizeX = blockSize[x];
-        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
-                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
-                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-#else
-        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-            y += (x < y ? -1 : 1);
-            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-        }
-
-        // Skip over tiles that have exclusions, since they were already processed.
-
-        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
-            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
-                ushort2 tile = exclusionTiles[skipBase+tgx];
-                skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
-            }
-            else
-                skipTiles[threadIdx.x] = end;
-            skipBase += TILE_SIZE;            
-            currentSkipIndex = tbx;
-        }
-        while (skipTiles[currentSkipIndex] < pos)
-            currentSkipIndex++;
-        includeTile = (skipTiles[currentSkipIndex] != pos);
-#endif
-        if (includeTile) {
-            unsigned int atom1 = x*TILE_SIZE + tgx;
-
-            // Load atom data for this tile.
-            
-            real4 posq1 = posq[atom1];
-            real charge1 = charge[atom1];
-            real bornRadius1 = global_bornRadii[atom1];
-#ifdef USE_CUTOFF
-            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
-#else
-            unsigned int j = y*TILE_SIZE + tgx;
-#endif
-            atomIndices[threadIdx.x] = j;
-            if (j < PADDED_NUM_ATOMS) {
-                real4 tempPosq = posq[j];
-                localData[threadIdx.x].x = tempPosq.x;
-                localData[threadIdx.x].y = tempPosq.y;
-                localData[threadIdx.x].z = tempPosq.z;
-                localData[threadIdx.x].q = charge[j];
-                localData[threadIdx.x].bornRadius = global_bornRadii[j];
-                localData[threadIdx.x].fx = 0.0f;
-                localData[threadIdx.x].fy = 0.0f;
-                localData[threadIdx.x].fz = 0.0f;
-                localData[threadIdx.x].fw = 0.0f;
-            }
-#ifdef USE_PERIODIC
-            if (singlePeriodicCopy) {
-                // The box is small enough that we can just translate all the atoms into a single periodic
-                // box, then skip having to apply periodic boundary conditions later.
-
-                real4 blockCenterX = blockCenter[x];
-                APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
-                APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x], blockCenterX)
-                unsigned int tj = tgx;
-                for (j = 0; j < TILE_SIZE; j++) {
-                    int atom2 = atomIndices[tbx+tj];
-                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                        real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
-                        real charge2 = localData[tbx+tj].q;
-                        real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
-                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                        if (r2 < CUTOFF_SQUARED) {
-                            real invR = RSQRT(r2);
-                            real r = r2*invR;
-                            real bornRadius2 = localData[tbx+tj].bornRadius;
-                            real alpha2_ij = bornRadius1*bornRadius2;
-                            real D_ij = r2*RECIP(4.0f*alpha2_ij);
-                            real expTerm = EXP(-D_ij);
-                            real denominator2 = r2 + alpha2_ij*expTerm;
-                            real denominator = SQRT(denominator2);
-                            real scaledChargeProduct = PREFACTOR*charge1*charge2;
-                            real tempEnergy = scaledChargeProduct*RECIP(denominator);
-                            real Gpol = tempEnergy*RECIP(denominator2);
-                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
-                            real dEdR = Gpol*(1.0f - 0.25f*expTerm);
-                            force.w += dGpol_dalpha2_ij*bornRadius2;
-#ifdef USE_CUTOFF
-                            tempEnergy -= scaledChargeProduct/CUTOFF;
-#endif
-                            if (needEnergy)
-                                energy += tempEnergy;
-                            delta *= dEdR;
-                            force.x -= delta.x;
-                            force.y -= delta.y;
-                            force.z -= delta.z;
-                            localData[tbx+tj].fx += delta.x;
-                            localData[tbx+tj].fy += delta.y;
-                            localData[tbx+tj].fz += delta.z;
-                            localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
-                        }
-                    }
-                    tj = (tj + 1) & (TILE_SIZE - 1);
-                }
-            }
-            else
-#endif
-            {
-                // We need to apply periodic boundary conditions separately for each interaction.
-
-                unsigned int tj = tgx;
-                for (j = 0; j < TILE_SIZE; j++) {
-                    int atom2 = atomIndices[tbx+tj];
-                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                        real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
-                        real charge2 = localData[tbx+tj].q;
-                        real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
-#ifdef USE_PERIODIC
-                        APPLY_PERIODIC_TO_DELTA(delta)
-#endif
-                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                        if (r2 < CUTOFF_SQUARED) {
-#endif
-                            real invR = RSQRT(r2);
-                            real r = r2*invR;
-                            real bornRadius2 = localData[tbx+tj].bornRadius;
-                            real alpha2_ij = bornRadius1*bornRadius2;
-                            real D_ij = r2*RECIP(4.0f*alpha2_ij);
-                            real expTerm = EXP(-D_ij);
-                            real denominator2 = r2 + alpha2_ij*expTerm;
-                            real denominator = SQRT(denominator2);
-                            real scaledChargeProduct = PREFACTOR*charge1*charge2;
-                            real tempEnergy = scaledChargeProduct*RECIP(denominator);
-                            real Gpol = tempEnergy*RECIP(denominator2);
-                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
-                            real dEdR = Gpol*(1.0f - 0.25f*expTerm);
-                            force.w += dGpol_dalpha2_ij*bornRadius2;
-#ifdef USE_CUTOFF
-                            tempEnergy -= scaledChargeProduct/CUTOFF;
-#endif
-                            if (needEnergy)
-                                energy += tempEnergy;
-                            delta *= dEdR;
-                            force.x -= delta.x;
-                            force.y -= delta.y;
-                            force.z -= delta.z;
-                            localData[tbx+tj].fx += delta.x;
-                            localData[tbx+tj].fy += delta.y;
-                            localData[tbx+tj].fz += delta.z;
-                            localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
-#ifdef USE_CUTOFF
-                        }
-#endif
-                    }
-                    tj = (tj + 1) & (TILE_SIZE - 1);
-                }
-            }
-
-            // Write results.
-
-            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
-            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
-            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
-            atomicAdd(&global_bornForce[atom1], static_cast<unsigned long long>((long long) (force.w*0x100000000)));
-#ifdef USE_CUTOFF
-            unsigned int atom2 = atomIndices[threadIdx.x];
-#else
-            unsigned int atom2 = y*TILE_SIZE + tgx;
-#endif
-            if (atom2 < PADDED_NUM_ATOMS) {
-                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
-                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
-                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
-                atomicAdd(&global_bornForce[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fw*0x100000000)));
-            }
-        }
-        pos++;
-    }
-    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
-}
--- a/platforms/cuda/src/kernels/nonbondedExceptions.cu
+++ b/platforms/cuda/src/kernels/nonbondedExceptions.cu
 float4 exceptionParams = PARAMS[index];
 real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
+#if APPLY_PERIODIC
+APPLY_PERIODIC_TO_DELTA(delta)
+#endif
 real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 real invR = RSQRT(r2);
 real sig2 = invR*exceptionParams.y;

--- a/platforms/cuda/src/kernels/noseHooverChain.cu
+++ b/platforms/cuda/src/kernels/noseHooverChain.cu
+
+#include <initializer_list>
+
+/**
+ * Propagate the Nose-Hoover chain variables and compute the velocity scale factor.
+ *
+ * The kinetic energy is read from energySum[0] and the scale factor is written to scaleFactor[0];
+ * the .y component of both is used when chainType is nonzero, the .x component otherwise.
+ * chainData[bead].x is treated as the bead position and chainData[bead].y as the bead velocity;
+ * both are updated in place.  chainMasses and chainForces are overwritten by this kernel.
+ *
+ * NOTE(review): nothing in this kernel depends on threadIdx/blockIdx, so every launched thread
+ * would perform the same updates on the same memory.  Presumably it is launched with a single
+ * thread -- confirm against the host code.
+ * NOTE(review): chainForces[chainLength-1] and chainMasses[0] are accessed unconditionally, so
+ * this looks like it assumes chainLength >= 1 -- confirm.
+ */
+extern "C" __global__ void propagateNoseHooverChain(mixed2* __restrict__ chainData, const mixed2 * __restrict__ energySum, mixed2* __restrict__ scaleFactor,
+                                                    mixed* __restrict__ chainMasses, mixed* __restrict__ chainForces, 
+                                                    int chainType, int chainLength, int numMTS, int numDOFs, float timeStep,
+                                                    mixed kT, float frequency){
+    // Pick the energy / scale component that belongs to this chain type.
+    const mixed & kineticEnergy = chainType ? energySum[0].y : energySum[0].x;
+    mixed &scale = chainType ? scaleFactor[0].y : scaleFactor[0].x;
+    // The scale factor is reset before the early exit, so it is 1 when nothing is propagated.
+    scale = (mixed) 1;
+    if(kineticEnergy < 1e-8) return;
+    // Every bead gets mass kT/frequency^2; the first bead is additionally multiplied by numDOFs.
+    for (int bead = 0; bead < chainLength; ++bead) chainMasses[bead] = kT / (frequency * frequency);
+    chainMasses[0] *= numDOFs;
+    mixed KE2 = 2.0f * kineticEnergy;
+    mixed timeOverMTS = timeStep / numMTS;
+    // Initial forces: the first bead is driven by the kinetic energy, each later bead by the
+    // velocity of the bead before it.
+    chainForces[0] = (KE2 - numDOFs * kT) / chainMasses[0];
+    for (int bead = 0; bead < chainLength - 1; ++bead) {
+        chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
+    }
+    for (int mts = 0; mts < numMTS; ++mts) {
+        // BEGIN_YS_LOOP / END_YS_LOOP are macros defined outside this file; presumably they
+        // iterate over a set of weights exposed as `ys` -- confirm against the host code.
+        BEGIN_YS_LOOP
+            mixed wdt = ys * timeOverMTS;
+            // Quarter-step velocity update of the last bead, then sweep back towards the first bead.
+            chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1];
+            for (int bead = chainLength - 2; bead >= 0; --bead) {
+                mixed aa = MIXEDEXP(-0.125f * wdt * chainData[bead + 1].y);
+                chainData[bead].y = aa * (chainData[bead].y * aa + 0.25f * wdt * chainForces[bead]);
+            }
+            // update particle velocities
+            // (only the accumulated scale factor is updated here; no particle data is touched)
+            mixed aa = MIXEDEXP(-0.5f * wdt * chainData[0].y);
+            scale *= aa;
+            // update the thermostat positions
+            for (int bead = 0; bead < chainLength; ++bead) {
+                chainData[bead].x += 0.5f * chainData[bead].y * wdt;
+            }
+            // update the forces
+            // (the kinetic energy is rescaled by scale^2 to reflect the scaling accumulated so far)
+            chainForces[0] = (scale * scale * KE2 - numDOFs * kT) / chainMasses[0];
+            // update thermostat velocities
+            // (forward sweep; the force on the next bead is recomputed from the new velocity)
+            for (int bead = 0; bead < chainLength - 1; ++bead) {
+                mixed aa = MIXEDEXP(-0.125f * wdt * chainData[bead + 1].y);
+                chainData[bead].y = aa * (aa * chainData[bead].y + 0.25f * wdt * chainForces[bead]);
+                chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
+            }
+            chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1];
+        END_YS_LOOP
+    } // MTS loop
+}
+
+
+/**
+ * Add the total (kinetic + potential) energy of the Nose-Hoover chain beads to heatBathEnergy[0].
+ *
+ * chainData[bead].x is the bead position and chainData[bead].y the bead velocity.  The first bead
+ * uses a prefactor of numDOFs, all other beads a prefactor of 1.
+ *
+ * The result is accumulated into heatBathEnergy[0], never assigned, so the caller must zero it
+ * before the first call.
+ */
+extern "C" __global__ void computeHeatBathEnergy(mixed* __restrict__ heatBathEnergy, int chainLength, int numDOFs,
+                                                 mixed kT, float frequency, const mixed2* __restrict__ chainData){
+    for (int bead = 0; bead < chainLength; ++bead) {
+        const mixed prefac = (bead == 0) ? numDOFs : 1;
+        const mixed beadMass = prefac * kT / (frequency * frequency);
+        const mixed2 beadState = chainData[bead];
+
+        // Kinetic contribution: 1/2 m v^2
+        heatBathEnergy[0] += 0.5f * beadMass * beadState.y * beadState.y;
+        // Potential contribution: prefac * kT * position
+        heatBathEnergy[0] += prefac * kT * beadState.x;
+    }
+}
+
+/**
+ * Compute the kinetic energy of the atoms listed in `atoms`.
+ *
+ * velm[atom].w is used as the inverse mass; a value of 0 is treated as mass 0, so such atoms
+ * contribute no energy.  Each thread overwrites its own slot of energyBuffer with its partial
+ * sum in .x and 0 in .y.
+ */
+extern "C" __global__ void computeAtomsKineticEnergy(mixed2 * __restrict__ energyBuffer, int numAtoms,
+                                                     const mixed4* __restrict__ velm, const int *__restrict__ atoms){
+    const unsigned int slot = blockIdx.x*blockDim.x+threadIdx.x;
+    const unsigned int stride = blockDim.x*gridDim.x;
+    mixed kinetic = 0;
+    for (int i = slot; i < numAtoms; i += stride) {
+        const mixed4 vel = velm[atoms[i]];
+        mixed mass = 0;
+        if (vel.w != 0)
+            mass = 1 / vel.w;
+        kinetic += 0.5f * mass * (vel.x*vel.x + vel.y*vel.y + vel.z*vel.z);
+    }
+    energyBuffer[slot] = make_mixed2(kinetic, 0);
+}
+
+/**
+ * Accumulate the kinetic energy of particle pairs, split into two parts:
+ *   .x  energy of the pair's center-of-mass motion
+ *   .y  energy of the relative motion (using the reduced mass)
+ *
+ * velm[atom].w is used as the inverse mass; a value of 0 is treated as mass 0.  A pair in which
+ * both particles are massless is skipped: it carries no kinetic energy, and evaluating the
+ * expressions below would divide by zero and turn the whole sum into NaN.
+ *
+ * The per-thread result is added to (not assigned to) this thread's slot of energyBuffer.
+ */
+extern "C" __global__ void computePairsKineticEnergy(mixed2 * __restrict__ energyBuffer, int numPairs,
+                                                     const mixed4* __restrict__ velm, const int2 *__restrict__ pairs){
+    mixed2 energy = make_mixed2(0,0);
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numPairs; index += blockDim.x*gridDim.x) {
+        int2 pair = pairs[index];
+        int atom1 = pair.x;
+        int atom2 = pair.y;
+        mixed4 v1 = velm[atom1];
+        mixed4 v2 = velm[atom2];
+        mixed m1 = v1.w == 0 ? 0 : 1 / v1.w;
+        mixed m2 = v2.w == 0 ? 0 : 1 / v2.w;
+        mixed totalMass = m1 + m2;
+        // Both particles massless: no energy to add, and the divisions below would be 0/0.
+        if (totalMass == 0)
+            continue;
+        // Center-of-mass velocity of the pair.
+        mixed4 cv;
+        cv.x = (m1*v1.x + m2*v2.x) / totalMass;
+        cv.y = (m1*v1.y + m2*v2.y) / totalMass;
+        cv.z = (m1*v1.z + m2*v2.z) / totalMass;
+        // Relative velocity of particle 2 with respect to particle 1.
+        mixed4 rv;
+        rv.x = v2.x - v1.x;
+        rv.y = v2.y - v1.y;
+        rv.z = v2.z - v1.z;
+        energy.x += 0.5f * totalMass * (cv.x*cv.x + cv.y*cv.y + cv.z*cv.z);
+        energy.y += 0.5f * (m1 * m2 / totalMass) * (rv.x*rv.x + rv.y*rv.y + rv.z*rv.z);
+    }
+    // The atoms version of this has been called already, so accumulate instead of assigning here
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x].x += energy.x;
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x].y += energy.y;
+}
+
+/**
+ * Multiply the x, y and z velocity components of every atom listed in `atoms` by
+ * scaleFactor[0].x.  The w component of velm is left untouched.
+ */
+extern "C" __global__ void scaleAtomsVelocities(mixed2* __restrict__ scaleFactor, int numAtoms,
+                                                mixed4* __restrict__ velm, const int *__restrict__ atoms){
+    const unsigned int stride = blockDim.x*gridDim.x;
+    int i = blockIdx.x*blockDim.x+threadIdx.x;
+    while (i < numAtoms) {
+        mixed4* velocity = &velm[atoms[i]];
+        const mixed factor = scaleFactor[0].x;
+        velocity->x *= factor;
+        velocity->y *= factor;
+        velocity->z *= factor;
+        i += stride;
+    }
+}
+
+/**
+ * Rescale the velocities of particle pairs.  The center-of-mass velocity of each pair is scaled
+ * by scaleFactor[0].x and the relative velocity by scaleFactor[0].y; the two particle velocities
+ * are then rebuilt from the scaled parts and written back to velm.
+ *
+ * velm[atom].w is used as the inverse mass; a value of 0 is treated as mass 0.  A pair in which
+ * both particles are massless is left unchanged: the expressions below would divide by zero and
+ * write NaN velocities back to velm.
+ */
+extern "C" __global__ void scalePairsVelocities(mixed2 * __restrict__ scaleFactor, int numPairs,
+                                                mixed4* __restrict__ velm, const int2 *__restrict__ pairs){
+    const mixed &absScale = scaleFactor[0].x;
+    const mixed &relScale = scaleFactor[0].y;
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numPairs; index += blockDim.x*gridDim.x) {
+        int atom1 = pairs[index].x;
+        int atom2 = pairs[index].y;
+        mixed4 v1 = velm[atom1];
+        mixed4 v2 = velm[atom2];
+        mixed m1 = v1.w == 0 ? 0 : 1 / v1.w;
+        mixed m2 = v2.w == 0 ? 0 : 1 / v2.w;
+        mixed totalMass = m1 + m2;
+        // Both particles massless: nothing to scale, and the divisions below would be 0/0.
+        if (totalMass == 0)
+            continue;
+        // Center-of-mass velocity of the pair.
+        mixed4 cv;
+        cv.x = (m1*v1.x + m2*v2.x) / totalMass;
+        cv.y = (m1*v1.y + m2*v2.y) / totalMass;
+        cv.z = (m1*v1.z + m2*v2.z) / totalMass;
+        // Relative velocity of particle 2 with respect to particle 1.
+        mixed4 rv;
+        rv.x = v2.x - v1.x;
+        rv.y = v2.y - v1.y;
+        rv.z = v2.z - v1.z;
+        // Rebuild each velocity from the scaled center-of-mass and relative parts; the w
+        // components of v1 and v2 are written back unchanged.
+        v1.x = absScale * cv.x - relScale * rv.x * m2 / totalMass;
+        v1.y = absScale * cv.y - relScale * rv.y * m2 / totalMass;
+        v1.z = absScale * cv.z - relScale * rv.z * m2 / totalMass;
+        v2.x = absScale * cv.x + relScale * rv.x * m1 / totalMass;
+        v2.y = absScale * cv.y + relScale * rv.y * m1 / totalMass;
+        v2.z = absScale * cv.z + relScale * rv.z * m1 / totalMass;
+        velm[atom1] = v1;
+        velm[atom2] = v2;
+    }
+}
+
+/**
+ * Sum an energy buffer whose entries are pairs of energies stored as mixed2, writing the pair of
+ * totals to *result.  Adapted from the reduction in utilities.cu.
+ *
+ * Indexing uses only threadIdx.x, so every block would compute the same sum.
+ * NOTE(review): presumably launched as a single block of workGroupSize threads, with
+ * workGroupSize <= WORK_GROUP_SIZE -- confirm against the host code.
+ */
+extern "C" __global__ void reduceEnergyPair(const mixed2* __restrict__ energyBuffer, mixed2* __restrict__ result, int bufferSize, int workGroupSize) {
+    __shared__ mixed2 partialSums[WORK_GROUP_SIZE];
+    const unsigned int lane = threadIdx.x;
+
+    // Each thread first sums a strided slice of the buffer.
+    mixed2 local = make_mixed2(0,0);
+    unsigned int entry = lane;
+    while (entry < bufferSize) {
+        local.x += energyBuffer[entry].x;
+        local.y += energyBuffer[entry].y;
+        entry += blockDim.x;
+    }
+    partialSums[lane] = local;
+
+    // Pairwise reduction in shared memory; the barrier at the top of each pass makes the
+    // previous pass's writes visible before they are read.
+    for (int step = 1; step < workGroupSize; step *= 2) {
+        __syncthreads();
+        const bool receives = (lane%(step*2) == 0);
+        if (receives && lane+step < workGroupSize) {
+            partialSums[lane].x += partialSums[lane+step].x;
+            partialSums[lane].y += partialSums[lane+step].y;
+        }
+    }
+    if (lane == 0)
+        *result = partialSums[0];
+}
--- a/platforms/cuda/src/kernels/removeCM.cu
+++ b/platforms/cuda/src/kernels/removeCM.cu
-/**
- * Calculate the center of mass momentum.
- */
-
-extern "C" __global__ void calcCenterOfMassMomentum(int numAtoms, const mixed4* __restrict__ velm, float4* __restrict__ cmMomentum) {
-    extern __shared__ volatile float3 temp[];
-    float3 cm = make_float3(0, 0, 0);
-    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
-        mixed4 velocity = velm[index];
-        if (velocity.w != 0) {
-            mixed mass = RECIP(velocity.w);
-            cm.x += (float) (velocity.x*mass);
-            cm.y += (float) (velocity.y*mass);
-            cm.z += (float) (velocity.z*mass);
-        }
-    }
-
-    // Sum the threads in this group.
-
-    int thread = threadIdx.x;
-    temp[thread].x = cm.x;
-    temp[thread].y = cm.y;
-    temp[thread].z = cm.z;
-    __syncthreads();
-    if (thread < 32) {
-        temp[thread].x += temp[thread+32].x;
-        temp[thread].y += temp[thread+32].y;
-        temp[thread].z += temp[thread+32].z;
-        if (thread < 16) {
-            temp[thread].x += temp[thread+16].x;
-            temp[thread].y += temp[thread+16].y;
-            temp[thread].z += temp[thread+16].z;
-        }
-        if (thread < 8) {
-            temp[thread].x += temp[thread+8].x;
-            temp[thread].y += temp[thread+8].y;
-            temp[thread].z += temp[thread+8].z;
-        }
-        if (thread < 4) {
-            temp[thread].x += temp[thread+4].x;
-            temp[thread].y += temp[thread+4].y;
-            temp[thread].z += temp[thread+4].z;
-        }
-        if (thread < 2) {
-            temp[thread].x += temp[thread+2].x;
-            temp[thread].y += temp[thread+2].y;
-            temp[thread].z += temp[thread+2].z;
-        }
-    }
-    if (thread == 0) {
-        float3 sum = make_float3(temp[thread].x+temp[thread+1].x, temp[thread].y+temp[thread+1].y, temp[thread].z+temp[thread+1].z);
-        cmMomentum[blockIdx.x] = make_float4(sum.x, sum.y, sum.z, 0.0f);
-    }
-}
-
-/**
- * Remove center of mass motion.
- */
-
-extern "C" __global__ void removeCenterOfMassMomentum(unsigned int numAtoms, mixed4* __restrict__ velm, const float4* __restrict__ cmMomentum) {
-    // First sum all of the momenta that were calculated by individual groups.
-
-    extern volatile float3 temp[];
-    float3 cm = make_float3(0, 0, 0);
-    for (unsigned int index = threadIdx.x; index < gridDim.x; index += blockDim.x) {
-        float4 momentum = cmMomentum[index];
-        cm.x += momentum.x;
-        cm.y += momentum.y;
-        cm.z += momentum.z;
-    }
-    int thread = threadIdx.x;
-    temp[thread].x = cm.x;
-    temp[thread].y = cm.y;
-    temp[thread].z = cm.z;
-    __syncthreads();
-    if (thread < 32) {
-        temp[thread].x += temp[thread+32].x;
-        temp[thread].y += temp[thread+32].y;
-        temp[thread].z += temp[thread+32].z;
-        if (thread < 16) {
-            temp[thread].x += temp[thread+16].x;
-            temp[thread].y += temp[thread+16].y;
-            temp[thread].z += temp[thread+16].z;
-        }
-        if (thread < 8) {
-            temp[thread].x += temp[thread+8].x;
-            temp[thread].y += temp[thread+8].y;
-            temp[thread].z += temp[thread+8].z;
-        }
-        if (thread < 4) {
-            temp[thread].x += temp[thread+4].x;
-            temp[thread].y += temp[thread+4].y;
-            temp[thread].z += temp[thread+4].z;
-        }
-        if (thread < 2) {
-            temp[thread].x += temp[thread+2].x;
-            temp[thread].y += temp[thread+2].y;
-            temp[thread].z += temp[thread+2].z;
-        }
-    }
-    __syncthreads();
-    cm = make_float3(INVERSE_TOTAL_MASS*(temp[0].x+temp[1].x), INVERSE_TOTAL_MASS*(temp[0].y+temp[1].y), INVERSE_TOTAL_MASS*(temp[0].z+temp[1].z));
-
-    // Now remove the center of mass velocity from each atom.
-
-    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
-        mixed4 velocity = velm[index];
-        velocity.x -= cm.x;
-        velocity.y -= cm.y;
-        velocity.z -= cm.z;
-        velm[index] = velocity;
-    }
-}
--- a/platforms/cuda/src/kernels/utilities.cu
+++ b/platforms/cuda/src/kernels/utilities.cu
@@ -99,4 +99,4 @@ __global__ void setCharges(real* __restrict__ charges, real4* __restrict__ posq,
    for (int i = blockDim.x*blockIdx.x+threadIdx.x; i < numAtoms; i += blockDim.x*gridDim.x)
        posq[i].w = charges[atomOrder[i]];
 }
-}
\ No newline at end of file
+}
--- a/platforms/cuda/src/kernels/vectorOps.cu
+++ b/platforms/cuda/src/kernels/vectorOps.cu
@@ -554,16 +554,16 @@ inline __device__ float3 cross(float3 a, float3 b) {
    return make_float3(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
 }

-inline __device__ float3 cross(float4 a, float4 b) {
-    return make_float3(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
+inline __device__ float4 cross(float4 a, float4 b) {
+    return make_float4(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x, 0.0f);
 }

 inline __device__ double3 cross(double3 a, double3 b) {
    return make_double3(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
 }

-inline __device__ double3 cross(double4 a, double4 b) {
-    return make_double3(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
+inline __device__ double4 cross(double4 a, double4 b) {
+    return make_double4(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x, 0.0);
 }

 // Normalize a vector
@@ -594,6 +594,14 @@ inline __device__ double4 normalize(double4 a) {

 // Strip off the fourth component of a vector.

+inline __device__ short3 trimTo3(short4 v) {  // short4 overload: v.w is dropped
+    return make_short3(v.x, v.y, v.z);
+}
+
+inline __device__ int3 trimTo3(int4 v) {  // int4 overload: v.w is dropped
+    return make_int3(v.x, v.y, v.z);
+}
+
 inline __device__ float3 trimTo3(float4 v) {
    return make_float3(v.x, v.y, v.z);
 }

--- a/platforms/cuda/src/kernels/velocityVerlet.cu
+++ b/platforms/cuda/src/kernels/velocityVerlet.cu
+/**
+ * Perform the first step of Velocity Verlet integration.
+ * 
+ * update displacements (posDelta) and velocities (velm)
+ *
+ * Two passes are made. The first handles the atoms listed in atomList one at a
+ * time. The second handles the atom pairs listed in pairList: the velocities of
+ * each pair are split into a center-of-mass part and a relative part, both parts
+ * are advanced, and the per-atom velocities are rebuilt from them.
+ *
+ * @param numAtoms        number of atomList entries to process
+ * @param numPairs        number of pairList entries to process
+ * @param paddedNumAtoms  offset between the x, y and z sections of force
+ * @param dt              dt[0].y turns velocities into displacements; the velocity
+ *                        update uses the mean of dt[0].x and dt[0].y
+ * @param posq            atom positions; only the w component ends up in posDelta
+ * @param posqCorrection  read only when USE_MIXED_PRECISION is defined
+ * @param velm            velocities in x, y, z with w used as the inverse mass; atoms
+ *                        whose w is 0 are not modified
+ * @param force           force components stored as long long, divided by 2^32 on use
+ * @param posDelta        output: x, y, z receive velocity*dt[0].y, w receives the w of posq
+ * @param atomList        indices of the atoms that are integrated individually
+ * @param pairList        index pairs (x, y) of the atoms that are integrated together
+ */
+
+extern "C" __global__ void integrateVelocityVerletPart1(int numAtoms, int numPairs, int paddedNumAtoms, const mixed2* __restrict__ dt, const real4* __restrict__ posq,
+        const real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta,
+        const int* __restrict__ atomList, const int2* __restrict__ pairList) {
+    const mixed2 stepSize = dt[0];
+    const mixed dtPos = stepSize.y;
+    const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
+    const mixed scale = 0.5f*dtVel/(mixed) 0x100000000;  // half of dtVel over 2^32 (presumably the fixed-point scale of force - confirm)
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+        int atom = atomList[index];
+        mixed4 velocity = velm[atom];
+        if (velocity.w != 0.0) {  // atoms with w == 0 are skipped entirely
+#ifdef USE_MIXED_PRECISION
+            real4 pos1 = posq[atom];
+            real4 pos2 = posqCorrection[atom];
+            mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
+#else
+            real4 pos = posq[atom];
+#endif
+            velocity.x += scale*force[atom]*velocity.w;
+            velocity.y += scale*force[atom+paddedNumAtoms]*velocity.w;
+            velocity.z += scale*force[atom+paddedNumAtoms*2]*velocity.w;
+            pos.x = velocity.x*dtPos;  // x, y, z of pos are replaced by the displacement; only w is kept from posq
+            pos.y = velocity.y*dtPos;
+            pos.z = velocity.z*dtPos;
+            posDelta[atom] = pos;
+            velm[atom] = velocity;
+        }
+    }
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numPairs; index += blockDim.x*gridDim.x) {
+        int atom1 = pairList[index].x;
+        int atom2 = pairList[index].y;
+        mixed4 v1 = velm[atom1];
+        mixed4 v2 = velm[atom2];
+        mixed m1 = v1.w == 0.0f ? 0.0f : 1.0f / v1.w;  // mass recovered from the inverse mass; 0 when w is 0
+        mixed m2 = v2.w == 0.0f ? 0.0f : 1.0f / v2.w;
+        mixed mass1fract = m1 / (m1 + m2);  // NOTE(review): 0/0 when both w are 0; nothing is stored in that case (both writes below are guarded)
+        mixed mass2fract = m2 / (m1 + m2);
+        mixed invRedMass = (m1 * m2 != 0.0f) ? (m1 + m2)/(m1 * m2) : 0.0f;  // inverse of m1*m2/(m1+m2), or 0 if either mass is 0
+        mixed invTotMass = (m1 + m2 != 0.0f) ? 1.0f /(m1 + m2) : 0.0f;
+        mixed3 comVel;  // mass-weighted mean velocity of the pair
+        comVel.x= v1.x*mass1fract + v2.x*mass2fract;
+        comVel.y= v1.y*mass1fract + v2.y*mass2fract;
+        comVel.z= v1.z*mass1fract + v2.z*mass2fract;
+        mixed3 relVel;  // velocity of atom2 relative to atom1
+        relVel.x= v2.x - v1.x;
+        relVel.y= v2.y - v1.y;
+        relVel.z= v2.z - v1.z;
+        // Sum of the two forces, and the mass-fraction-weighted difference of the two forces.
+        mixed3 comFrc;
+        comFrc.x = force[atom1] + force[atom2];
+        comFrc.y = force[atom1 + paddedNumAtoms] + force[atom2 + paddedNumAtoms];
+        comFrc.z = force[atom1 + paddedNumAtoms*2] + force[atom2 + paddedNumAtoms*2];
+        mixed3 relFrc;
+        relFrc.x = mass1fract*force[atom2] - mass2fract*force[atom1];
+        relFrc.y = mass1fract*force[atom2+paddedNumAtoms] - mass2fract*force[atom1+paddedNumAtoms];
+        relFrc.z = mass1fract*force[atom2+paddedNumAtoms*2] - mass2fract*force[atom1+paddedNumAtoms*2];
+        comVel.x += comFrc.x * scale * invTotMass;
+        comVel.y += comFrc.y * scale * invTotMass;
+        comVel.z += comFrc.z * scale * invTotMass;
+        relVel.x += relFrc.x * scale * invRedMass;
+        relVel.y += relFrc.y * scale * invRedMass;
+        relVel.z += relFrc.z * scale * invRedMass;
+#ifdef USE_MIXED_PRECISION
+        real4 posv1 = posq[atom1];
+        real4 posv2 = posq[atom2];
+        real4 posc1 = posqCorrection[atom1];
+        real4 posc2 = posqCorrection[atom2];
+        mixed4 pos1 = make_mixed4(posv1.x+(mixed)posc1.x, posv1.y+(mixed)posc1.y, posv1.z+(mixed)posc1.z, posv1.w);
+        mixed4 pos2 = make_mixed4(posv2.x+(mixed)posc2.x, posv2.y+(mixed)posc2.y, posv2.z+(mixed)posc2.z, posv2.w);
+#else
+        real4 pos1 = posq[atom1];
+        real4 pos2 = posq[atom2];
+#endif
+        if (v1.w != 0.0f) {  // rebuild atom1's velocity from the advanced center-of-mass and relative parts
+            v1.x = comVel.x - relVel.x*mass2fract;
+            v1.y = comVel.y - relVel.y*mass2fract;
+            v1.z = comVel.z - relVel.z*mass2fract;
+            pos1.x = v1.x*dtPos;  // as in the single-atom pass, only w of pos1 is kept from posq
+            pos1.y = v1.y*dtPos;
+            pos1.z = v1.z*dtPos;
+            posDelta[atom1] = pos1;
+            velm[atom1] = v1;
+        }
+        if (v2.w != 0.0f) {  // same for atom2, with the opposite sign on the relative part
+            v2.x = comVel.x + relVel.x*mass1fract;
+            v2.y = comVel.y + relVel.y*mass1fract;
+            v2.z = comVel.z + relVel.z*mass1fract;
+            pos2.x = v2.x*dtPos;
+            pos2.y = v2.y*dtPos;
+            pos2.z = v2.z*dtPos;
+            posDelta[atom2] = pos2;
+            velm[atom2] = v2;
+        }
+    }
+}
+
+/**
+ * Second stage of the Velocity Verlet step.
+ *
+ * For every atom whose velm.w is nonzero, the displacement stored in posDelta is
+ * added to the atom's position and the result is written back to posq. When
+ * USE_MIXED_PRECISION is defined the position is assembled from posq plus
+ * posqCorrection before the addition, and the part lost when rounding back to
+ * real is stored in posqCorrection. The thread with global index 0 also copies
+ * dt[0].y into dt[0].x.
+ */
+
+extern "C" __global__ void integrateVelocityVerletPart2(int numAtoms, mixed2* __restrict__ dt, real4* __restrict__ posq,
+        real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) {
+    const mixed2 stepSize = dt[0];
+    const int firstAtom = blockIdx.x*blockDim.x+threadIdx.x;
+    if (firstAtom == 0)
+        dt[0].x = stepSize.y;
+    for (int atom = firstAtom; atom < numAtoms; atom += blockDim.x*gridDim.x) {
+        // Atoms with a zero w component in velm keep their position.
+        if (velm[atom].w == 0.0)
+            continue;
+#ifdef USE_MIXED_PRECISION
+        const real4 coarse = posq[atom];
+        const real4 fine = posqCorrection[atom];
+        mixed4 updated = make_mixed4(coarse.x+(mixed)fine.x, coarse.y+(mixed)fine.y, coarse.z+(mixed)fine.z, coarse.w);
+#else
+        real4 updated = posq[atom];
+#endif
+        const mixed4 shift = posDelta[atom];
+        updated.x += shift.x;
+        updated.y += shift.y;
+        updated.z += shift.z;
+#ifdef USE_MIXED_PRECISION
+        // Store the value rounded to real, and keep the rounding remainder separately.
+        const real4 rounded = make_real4((real) updated.x, (real) updated.y, (real) updated.z, (real) updated.w);
+        posq[atom] = rounded;
+        posqCorrection[atom] = make_real4(updated.x-rounded.x, updated.y-rounded.y, updated.z-rounded.z, 0);
+#else
+        posq[atom] = updated;
+#endif
+    }
+}
+
+
+/**
+ * Perform the third step of Velocity Verlet integration.
+ *
+ * modify the velocities (velm) after the force update
+ *
+ * Every velocity receives a force contribution plus
+ * (posDelta - velocity*dt[0].y)/dt[0].y, i.e. the difference between the stored
+ * displacement and the displacement the current velocity would give, divided by
+ * the step size. NOTE(review): posDelta presumably holds the displacement after
+ * constraints were applied (the local name is deltaXconstrained) - confirm
+ * against the host code.
+ *
+ * Atoms listed in atomList are updated one at a time; atoms listed in pairList
+ * are updated through the center-of-mass and relative velocities of their pair.
+ * The thread with global index 0 also copies dt[0].y into dt[0].x.
+ *
+ * @param numAtoms        number of atomList entries to process
+ * @param numPairs        number of pairList entries to process
+ * @param paddedNumAtoms  offset between the x, y and z sections of force
+ * @param dt              step sizes; dt[0].x is overwritten with dt[0].y
+ * @param posq            not referenced in the body of this kernel
+ * @param posqCorrection  not referenced in the body of this kernel
+ * @param velm            velocities in x, y, z with w used as the inverse mass; atoms
+ *                        whose w is 0 are not modified
+ * @param force           force components stored as long long, divided by 2^32 on use
+ * @param posDelta        per-atom displacement read for the correction term
+ * @param atomList        indices of the atoms that are updated individually
+ * @param pairList        index pairs (x, y) of the atoms that are updated together
+ */
+
+extern "C" __global__ void integrateVelocityVerletPart3(int numAtoms, int numPairs, int paddedNumAtoms, mixed2* __restrict__ dt, real4* __restrict__ posq,
+        real4* __restrict__ posqCorrection, mixed4* __restrict__ velm,  const long long* __restrict__ force, const mixed4* __restrict__ posDelta,
+        const int* __restrict__ atomList, const int2* __restrict__ pairList) {
+    mixed2 stepSize = dt[0];
+#if __CUDA_ARCH__ >= 130
+    double oneOverDt = 1.0/stepSize.y;
+#else
+    float oneOverDt = 1.0f/stepSize.y;
+    float correction = (1.0f-oneOverDt*stepSize.y)/stepSize.y;
+#endif
+    const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
+    const mixed scale = 0.5f*dtVel/(mixed) 0x100000000;  // half of dtVel over 2^32 (presumably the fixed-point scale of force - confirm)
+    int index = blockIdx.x*blockDim.x+threadIdx.x;  // used only for the thread-0 test below; both loops declare their own index
+    if (index == 0)
+        dt[0].x = stepSize.y;
+
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+        int atom = atomList[index];
+        mixed4 velocity = velm[atom];
+        if (velocity.w != 0.0) {  // atoms with w == 0 are skipped entirely
+            mixed4 deltaXconstrained = posDelta[atom];
+            velocity.x += scale*force[atom]*velocity.w + (deltaXconstrained.x - velocity.x*stepSize.y)*oneOverDt;
+            velocity.y += scale*force[atom+paddedNumAtoms]*velocity.w + (deltaXconstrained.y - velocity.y*stepSize.y)*oneOverDt;
+            velocity.z += scale*force[atom+paddedNumAtoms*2]*velocity.w + (deltaXconstrained.z - velocity.z*stepSize.y)*oneOverDt;
+#if __CUDA_ARCH__ < 130
+            velocity.x += (deltaXconstrained.x - velocity.x*stepSize.y)*correction;  // NOTE(review): velocity here is the value already updated just above
+            velocity.y += (deltaXconstrained.y - velocity.y*stepSize.y)*correction;
+            velocity.z += (deltaXconstrained.z - velocity.z*stepSize.y)*correction;
+#endif
+            velm[atom] = velocity;
+        }
+    }
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numPairs; index += blockDim.x*gridDim.x) {
+        int atom1 = pairList[index].x;
+        int atom2 = pairList[index].y;
+        mixed4 v1 = velm[atom1];
+        mixed4 v2 = velm[atom2];
+        mixed m1 = v1.w == 0.0f ? 0.0f : 1.0f / v1.w;  // mass recovered from the inverse mass; 0 when w is 0
+        mixed m2 = v2.w == 0.0f ? 0.0f : 1.0f / v2.w;
+        mixed mass1fract = m1 / (m1 + m2);  // NOTE(review): 0/0 when both w are 0; nothing is stored in that case (both writes below are guarded)
+        mixed mass2fract = m2 / (m1 + m2);
+        mixed invRedMass = (m1 * m2 != 0.0f) ? (m1 + m2)/(m1 * m2) : 0.0f;  // inverse of m1*m2/(m1+m2), or 0 if either mass is 0
+        mixed invTotMass = (m1 + m2 != 0.0f) ? 1.0f /(m1 + m2) : 0.0f;
+        mixed3 comVel;  // mass-weighted mean velocity of the pair
+        comVel.x= v1.x*mass1fract + v2.x*mass2fract;
+        comVel.y= v1.y*mass1fract + v2.y*mass2fract;
+        comVel.z= v1.z*mass1fract + v2.z*mass2fract;
+        mixed3 relVel;  // velocity of atom2 relative to atom1
+        relVel.x= v2.x - v1.x;
+        relVel.y= v2.y - v1.y;
+        relVel.z= v2.z - v1.z;
+        // Sum of the two forces, and the mass-fraction-weighted difference of the two forces.
+        mixed3 comFrc;
+        comFrc.x = force[atom1] + force[atom2];
+        comFrc.y = force[atom1 + paddedNumAtoms] + force[atom2 + paddedNumAtoms];
+        comFrc.z = force[atom1 + paddedNumAtoms*2] + force[atom2 + paddedNumAtoms*2];
+        mixed3 relFrc;
+        relFrc.x = mass1fract*force[atom2] - mass2fract*force[atom1];
+        relFrc.y = mass1fract*force[atom2+paddedNumAtoms] - mass2fract*force[atom1+paddedNumAtoms];
+        relFrc.z = mass1fract*force[atom2+paddedNumAtoms*2] - mass2fract*force[atom1+paddedNumAtoms*2];
+        comVel.x += comFrc.x * scale * invTotMass;
+        comVel.y += comFrc.y * scale * invTotMass;
+        comVel.z += comFrc.z * scale * invTotMass;
+        relVel.x += relFrc.x * scale * invRedMass;
+        relVel.y += relFrc.y * scale * invRedMass;
+        relVel.z += relFrc.z * scale * invRedMass;
+        if (v1.w != 0.0f) {
+            mixed4 deltaXconstrained = posDelta[atom1];
+            v1.x = comVel.x - relVel.x*mass2fract + (deltaXconstrained.x - v1.x*stepSize.y)*oneOverDt;  // v1 on the right-hand side is still the value loaded from velm
+            v1.y = comVel.y - relVel.y*mass2fract + (deltaXconstrained.y - v1.y*stepSize.y)*oneOverDt;
+            v1.z = comVel.z - relVel.z*mass2fract + (deltaXconstrained.z - v1.z*stepSize.y)*oneOverDt;
+#if __CUDA_ARCH__ < 130
+            v1.x += (deltaXconstrained.x - v1.x*stepSize.y)*correction;  // NOTE(review): v1 here is the value already updated just above
+            v1.y += (deltaXconstrained.y - v1.y*stepSize.y)*correction;
+            v1.z += (deltaXconstrained.z - v1.z*stepSize.y)*correction;
+#endif
+            velm[atom1] = v1;
+        }
+        if (v2.w != 0.0f) {
+            mixed4 deltaXconstrained = posDelta[atom2];
+            v2.x = comVel.x + relVel.x*mass1fract + (deltaXconstrained.x - v2.x*stepSize.y)*oneOverDt;  // v2 on the right-hand side is still the value loaded from velm
+            v2.y = comVel.y + relVel.y*mass1fract + (deltaXconstrained.y - v2.y*stepSize.y)*oneOverDt;
+            v2.z = comVel.z + relVel.z*mass1fract + (deltaXconstrained.z - v2.z*stepSize.y)*oneOverDt;
+#if __CUDA_ARCH__ < 130
+            v2.x += (deltaXconstrained.x - v2.x*stepSize.y)*correction;  // NOTE(review): v2 here is the value already updated just above
+            v2.y += (deltaXconstrained.y - v2.y*stepSize.y)*correction;
+            v2.z += (deltaXconstrained.z - v2.z*stepSize.y)*correction;
+#endif
+            velm[atom2] = v2;
+        }
+    }
+}
+
+
+
+/**
+ * Apply the hard wall constraint 
+ *
+ * For every pair in pairList whose separation is larger than maxPairDistance[0],
+ * the positions and the along-bond velocity components of the pair are reset so
+ * that the separation "bounces" off the wall. Velocity components perpendicular
+ * to the bond are kept. Nothing is done when maxPairDistance[0] is not positive.
+ *
+ * @param numPairs         number of pairList entries to process
+ * @param maxPairDistance  maxPairDistance[0] is the largest allowed pair separation
+ * @param dt               dt[0].y is the upper bound for the time deltaT used in the position reset
+ * @param posq             atom positions; rewritten for pairs that violate the wall
+ * @param posqCorrection   accessed only when USE_MIXED_PRECISION is defined
+ * @param velm             velocities in x, y, z with w used as the inverse mass; rewritten
+ *                         for pairs that violate the wall
+ * @param pairList         atom index pairs; going by the comment in the body, x is the
+ *                         Drude particle and y its parent
+ * @param pairTemperature  per-pair temperature; sqrt(pairTemperature*BOLTZ) scales the
+ *                         along-bond speed after the bounce (BOLTZ is defined outside this kernel)
+ */
+
+extern "C" __global__ void integrateVelocityVerletHardWall(int numPairs, const float* __restrict__ maxPairDistance, mixed2* __restrict__ dt, real4* __restrict__ posq,
+        real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, 
+        const int2* __restrict__ pairList, const float* __restrict__ pairTemperature) {
+    mixed dtPos = dt[0].y;
+    mixed maxDelta = (mixed) maxPairDistance[0];
+    // Apply hard wall constraints.
+    if (maxDelta > 0) {
+        for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numPairs; index += blockDim.x*gridDim.x) {
+            const mixed hardWallScale = sqrt( ((mixed) pairTemperature[index]) * ((mixed) BOLTZ));
+            int2 atom = make_int2(pairList[index].x, pairList[index].y);
+#ifdef USE_MIXED_PRECISION
+            real4 posv1 = posq[atom.x];
+            real4 posc1 = posqCorrection[atom.x];
+            mixed4 pos1 = make_mixed4(posv1.x+(mixed)posc1.x, posv1.y+(mixed)posc1.y, posv1.z+(mixed)posc1.z, posv1.w);
+            real4 posv2 = posq[atom.y];
+            real4 posc2 = posqCorrection[atom.y];
+            mixed4 pos2 = make_mixed4(posv2.x+(mixed)posc2.x, posv2.y+(mixed)posc2.y, posv2.z+(mixed)posc2.z, posv2.w);
+#else
+            real4 pos1 = posq[atom.x];
+            real4 pos2 = posq[atom.y];
+#endif
+            mixed3 delta = make_mixed3(
+                mixed (pos1.x - pos2.x),
+                mixed (pos1.y - pos2.y),
+                mixed (pos1.z - pos2.z)
+            );
+            mixed r = sqrt(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z);
+            mixed rInv = 1/r;
+            if (rInv*maxDelta < 1.0) {  // i.e. maxDelta/r < 1: the separation exceeds the limit
+                // The constraint has been violated, so make the inter-particle distance "bounce"
+                // off the hard wall.
+                mixed3 bondDir = make_mixed3(delta.x * rInv, delta.y * rInv, delta.z * rInv);  // unit vector from atom.y towards atom.x
+                mixed3 vel1 = make_mixed3(velm[atom.x].x, velm[atom.x].y, velm[atom.x].z);
+                mixed3 vel2 = make_mixed3(velm[atom.y].x, velm[atom.y].y, velm[atom.y].z);
+                mixed m1 = velm[atom.x].w != 0.0 ? 1.0/velm[atom.x].w : 0.0;  // mass recovered from the inverse mass; 0 when w is 0
+                mixed m2 = velm[atom.y].w != 0.0 ? 1.0/velm[atom.y].w : 0.0;
+                mixed invTotMass = (m1 + m2 != 0.0) ? 1.0 /(m1 + m2) : 0.0;
+                mixed deltaR = r-maxDelta;  // how far the separation is beyond the limit
+                mixed deltaT = dtPos;
+                mixed dt = dtPos;  // NOTE(review): this local shadows the kernel parameter dt inside this block
+
+                mixed dotvr1 = vel1.x*bondDir.x + vel1.y*bondDir.y + vel1.z*bondDir.z;  // component of vel1 along the bond
+                mixed3 vb1 = make_mixed3(bondDir.x*dotvr1, bondDir.y*dotvr1, bondDir.z*dotvr1);
+                mixed3 vp1 = make_mixed3(vel1.x-vb1.x, vel1.y-vb1.y, vel1.z-vb1.z);  // remainder of vel1, perpendicular to the bond
+                if (m2 == 0) {
+                    // The parent particle is massless, so move only the Drude particle.
+
+                    if (dotvr1 != 0.0)
+                        deltaT = deltaR/fabs(dotvr1);
+                    if (deltaT > dtPos)
+                        deltaT = dtPos;
+                    dotvr1 = -dotvr1*hardWallScale/(fabs(dotvr1)*sqrt(m1));  // sign is flipped, magnitude becomes hardWallScale/sqrt(m1); NOTE(review): 0/0 when dotvr1 is exactly 0
+                    mixed dr = -deltaR + deltaT*dotvr1;
+                    pos1.x += bondDir.x*dr;
+                    pos1.y += bondDir.y*dr;
+                    pos1.z += bondDir.z*dr;
+                    velm[atom.x] = make_mixed4(vp1.x + bondDir.x*dotvr1, vp1.y + bondDir.y*dotvr1, vp1.z + bondDir.z*dotvr1, velm[atom.x].w);
+#ifdef USE_MIXED_PRECISION
+                    posq[atom.x] = make_real4((real) pos1.x, (real) pos1.y, (real) pos1.z, (real) pos1.w);
+                    posqCorrection[atom.x] = make_real4(pos1.x-(real) pos1.x, pos1.y-(real) pos1.y, pos1.z-(real) pos1.z, 0);
+#else
+                    posq[atom.x] = pos1;
+#endif
+                }
+                else {
+                    // Move both particles.
+                    mixed dotvr2 = vel2.x*bondDir.x + vel2.y*bondDir.y + vel2.z*bondDir.z;
+                    mixed3 vb2 = make_mixed3(bondDir.x*dotvr2, bondDir.y*dotvr2, bondDir.z*dotvr2);
+                    mixed3 vp2 = make_mixed3(vel2.x-vb2.x, vel2.y-vb2.y, vel2.z-vb2.z);
+                    mixed vbCMass = (m1*dotvr1 + m2*dotvr2)*invTotMass;  // mass-weighted mean of the two along-bond velocities
+                    dotvr1 -= vbCMass;  // from here until vbCMass is added back, dotvr1 and dotvr2 are relative to that mean
+                    dotvr2 -= vbCMass;
+                    if (dotvr1 != dotvr2)
+                        deltaT = deltaR/fabs(dotvr1-dotvr2);
+                    if (deltaT > dt)
+                        deltaT = dt;
+                    mixed vBond = hardWallScale/sqrt(m1);
+                    dotvr1 = -dotvr1*vBond*m2*invTotMass/fabs(dotvr1);  // NOTE(review): 0/0 when dotvr1 is exactly 0 at this point
+                    dotvr2 = -dotvr2*vBond*m1*invTotMass/fabs(dotvr2);  // NOTE(review): 0/0 when dotvr2 is exactly 0 at this point
+                    mixed dr1 = -deltaR*m2*invTotMass + deltaT*dotvr1;
+                    mixed dr2 = deltaR*m1*invTotMass + deltaT*dotvr2;
+                    dotvr1 += vbCMass;
+                    dotvr2 += vbCMass;
+                    pos1.x += bondDir.x*dr1;
+                    pos1.y += bondDir.y*dr1;
+                    pos1.z += bondDir.z*dr1;
+                    pos2.x += bondDir.x*dr2;
+                    pos2.y += bondDir.y*dr2;
+                    pos2.z += bondDir.z*dr2;
+                    velm[atom.x] = make_mixed4(vp1.x + bondDir.x*dotvr1, vp1.y + bondDir.y*dotvr1, vp1.z + bondDir.z*dotvr1, velm[atom.x].w);
+                    velm[atom.y] = make_mixed4(vp2.x + bondDir.x*dotvr2, vp2.y + bondDir.y*dotvr2, vp2.z + bondDir.z*dotvr2, velm[atom.y].w);
+#ifdef USE_MIXED_PRECISION
+                    posq[atom.x] = make_real4((real) pos1.x, (real) pos1.y, (real) pos1.z, (real) pos1.w);
+                    posq[atom.y] = make_real4((real) pos2.x, (real) pos2.y, (real) pos2.z, (real) pos2.w);
+                    posqCorrection[atom.x] = make_real4(pos1.x-(real) pos1.x, pos1.y-(real) pos1.y, pos1.z-(real) pos1.z, 0);
+                    posqCorrection[atom.y] = make_real4(pos2.x-(real) pos2.x, pos2.y-(real) pos2.y, pos2.z-(real) pos2.z, 0);
+#else
+                    posq[atom.x] = pos1;
+                    posq[atom.y] = pos2;
+#endif
+                }
+            }
+        }
+    } /* end of hard wall constraint part */
+}
--- a/platforms/cuda/staticTarget/CMakeLists.txt
+++ b/platforms/cuda/staticTarget/CMakeLists.txt
@@ -4,17 +4,17 @@
 INCLUDE(FindCUDA)
 INCLUDE_DIRECTORIES(${CUDA_TOOLKIT_INCLUDE})

-FILE(GLOB CUDA_KERNELS ${CUDA_SOURCE_DIR}/kernels/*.cu)
-ADD_CUSTOM_COMMAND(OUTPUT ${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H}
+FILE(GLOB CUDA_KERNELS ${KERNEL_SOURCE_DIR}/kernels/*.cu)
+ADD_CUSTOM_COMMAND(OUTPUT ${KERNELS_CPP} ${KERNELS_H}
    COMMAND ${CMAKE_COMMAND}
-    ARGS -D CUDA_SOURCE_DIR=${CUDA_SOURCE_DIR} -D CUDA_KERNELS_CPP=${CUDA_KERNELS_CPP} -D CUDA_KERNELS_H=${CUDA_KERNELS_H} -D CUDA_SOURCE_CLASS=${CUDA_SOURCE_CLASS} -P ${CMAKE_CURRENT_SOURCE_DIR}/../EncodeCUDAFiles.cmake
+    ARGS -D KERNEL_SOURCE_DIR=${KERNEL_SOURCE_DIR} -D KERNELS_CPP=${KERNELS_CPP} -D KERNELS_H=${KERNELS_H} -D KERNEL_SOURCE_CLASS=${KERNEL_SOURCE_CLASS} -D KERNEL_FILE_EXTENSION=cu -P ${CMAKE_SOURCE_DIR}/cmake_modules/EncodeKernelFiles.cmake
    DEPENDS ${CUDA_KERNELS}
 )
-SET_SOURCE_FILES_PROPERTIES(${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H} PROPERTIES GENERATED TRUE)
+SET_SOURCE_FILES_PROPERTIES(${KERNELS_CPP} ${KERNELS_H} PROPERTIES GENERATED TRUE)
 ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})

 TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME} ${CUDA_CUDA_LIBRARY} ${CUDA_cufft_LIBRARY} ${PTHREADS_LIB_STATIC})
-SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_CUDA_BUILDING_STATIC_LIBRARY")
+SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_COMMON_BUILDING_STATIC_LIBRARY")
 IF (APPLE)
    SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA")
 ELSE (APPLE)

--- a/platforms/cuda/tests/TestCudaBAOABLangevinIntegrator.cpp
+++ b/platforms/cuda/tests/TestCudaBAOABLangevinIntegrator.cpp
@@ -30,7 +30,7 @@
 * -------------------------------------------------------------------------- */

 #include "CudaTests.h"
-#include "TestBAOABLangevinIntegrator.h"
+#include "TestLangevinMiddleIntegrator.h"

 void runPlatformTests() {
 }
--- a/platforms/cuda/tests/TestCudaNoseHooverIntegrator.cpp
+++ b/platforms/cuda/tests/TestCudaNoseHooverIntegrator.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Authors: Andreas Krämer and Andrew C. Simmonett                            *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaTests.h"
+#include "TestNoseHooverIntegrator.h"
+
+void runPlatformTests() {  // empty body: this file defines no additional tests of its own
+}
--- a/platforms/cuda/tests/TestCudaNoseHooverThermostat.cpp
+++ b/platforms/cuda/tests/TestCudaNoseHooverThermostat.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Authors: Andreas Krämer and Andrew C. Simmonett                            *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaTests.h"
+#include "TestNoseHooverThermostat.h"
+
+void runPlatformTests() {  // empty body: this file defines no additional tests of its own
+}