Merge branch 'master' into gayberne

2e9c418a · peastman · 8f532e31 · a4d327f5 · 2e9c418a · 2e9c418a
Commit 2e9c418a authored May 05, 2016 by peastman
20 changed files
--- a/platforms/cpu/src/CpuCustomNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuCustomNonbondedForce.cpp

-/* Portions copyright (c) 2009-2014 Stanford University and Simbios.
+/* Portions copyright (c) 2009-2016 Stanford University and Simbios.
 * Contributors: Peter Eastman
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -190,7 +190,8 @@ void CpuCustomNonbondedForce::threadComputeForce(ThreadPool& threads, int thread
            int blockIndex = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
            if (blockIndex >= neighborList->getNumBlocks())
                break;
-            const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
+            const int blockSize = neighborList->getBlockSize();
+            const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
            const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
            const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
            for (int i = 0; i < (int) neighbors.size(); i++) {
@@ -199,7 +200,7 @@ void CpuCustomNonbondedForce::threadComputeForce(ThreadPool& threads, int thread
                    ReferenceForce::setVariable(data.energyParticleParams[j*2], atomParameters[first][j]);
                    ReferenceForce::setVariable(data.forceParticleParams[j*2], atomParameters[first][j]);
                }
-                for (int k = 0; k < 4; k++) {
+                for (int k = 0; k < blockSize; k++) {
                    if ((exclusions[i] & (1<<k)) == 0) {
                        int second = blockAtom[k];
                        for (int j = 0; j < (int) paramNames.size(); j++) {

--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -230,6 +230,7 @@ CpuCalcForcesAndEnergyKernel::CpuCalcForcesAndEnergyKernel(std::string name, con

 void CpuCalcForcesAndEnergyKernel::initialize(const System& system) {
    referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().initialize(system);
+    lastPositions.resize(system.getNumParticles(), Vec3(1e10, 1e10, 1e10));
 }

 void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
@@ -237,11 +238,60 @@ void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool i
    
    // Convert positions to single precision and clear the forces.

-    InitForceTask task(context.getSystem().getNumParticles(), context, data);
+    int numParticles = context.getSystem().getNumParticles();
+    InitForceTask task(numParticles, context, data);
    data.threads.execute(task);
    data.threads.waitForThreads();
    if (!task.positionsValid)
        throw OpenMMException("Particle coordinate is nan");
+
+    // Determine whether we need to recompute the neighbor list.
+        
+    if (data.neighborList != NULL) {
+        double padding = data.paddedCutoff-data.cutoff;;
+        bool needRecompute = false;
+        double closeCutoff2 = 0.25*padding*padding;
+        double farCutoff2 = 0.5*padding*padding;
+        int maxNumMoved = numParticles/10;
+        vector<int> moved;
+        vector<RealVec>& posData = extractPositions(context);
+        for (int i = 0; i < numParticles; i++) {
+            RealVec delta = posData[i]-lastPositions[i];
+            double dist2 = delta.dot(delta);
+            if (dist2 > closeCutoff2) {
+                moved.push_back(i);
+                if (dist2 > farCutoff2 || moved.size() > maxNumMoved) {
+                    needRecompute = true;
+                    break;
+                }
+            }
+        }
+        if (!needRecompute && moved.size() > 0) {
+            // Some particles have moved further than half the padding distance.  Look for pairs
+            // that are missing from the neighbor list.
+
+            int numMoved = moved.size();
+            double cutoff2 = data.cutoff*data.cutoff;
+            double paddedCutoff2 = data.paddedCutoff*data.paddedCutoff;
+            for (int i = 1; i < numMoved && !needRecompute; i++)
+                for (int j = 0; j < i; j++) {
+                    RealVec delta = posData[moved[i]]-posData[moved[j]];
+                    if (delta.dot(delta) < cutoff2) {
+                        // These particles should interact.  See if they are in the neighbor list.
+                        
+                        RealVec oldDelta = lastPositions[moved[i]]-lastPositions[moved[j]];
+                        if (oldDelta.dot(oldDelta) > paddedCutoff2) {
+                            needRecompute = true;
+                            break;
+                        }
+                    }
+                }
+        }
+        if (needRecompute) {
+            data.neighborList->computeNeighborList(numParticles, data.posq, data.exclusions, extractBoxVectors(context), data.isPeriodic, data.paddedCutoff, data.threads);
+            lastPositions = posData;
+        }
+    }
 }

 double CpuCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups, bool& valid) {
@@ -283,6 +333,7 @@ void CpuCalcHarmonicAngleForceKernel::initialize(const System& system, const Har
        angleParamArray[i][1] = (RealOpenMM) k;
    }
    bondForce.initialize(system.getNumParticles(), numAngles, 3, angleIndexArray, data.threads);
+    usePeriodic = force.usesPeriodicBoundaryConditions();
 }

 double CpuCalcHarmonicAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
@@ -290,6 +341,8 @@ double CpuCalcHarmonicAngleForceKernel::execute(ContextImpl& context, bool inclu
    vector<RealVec>& forceData = extractForces(context);
    RealOpenMM energy = 0;
    ReferenceAngleBondIxn angleBond;
+    if (usePeriodic)
+        angleBond.setPeriodic(extractBoxVectors(context));
    bondForce.calculateForce(posData, angleParamArray, forceData, includeEnergy ? &energy : NULL, angleBond);
    return energy;
 }
@@ -343,6 +396,7 @@ void CpuCalcPeriodicTorsionForceKernel::initialize(const System& system, const P
        torsionParamArray[i][2] = (RealOpenMM) periodicity;
    }
    bondForce.initialize(system.getNumParticles(), numTorsions, 4, torsionIndexArray, data.threads);
+    usePeriodic = force.usesPeriodicBoundaryConditions();
 }

 double CpuCalcPeriodicTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
@@ -350,6 +404,8 @@ double CpuCalcPeriodicTorsionForceKernel::execute(ContextImpl& context, bool inc
    vector<RealVec>& forceData = extractForces(context);
    RealOpenMM energy = 0;
    ReferenceProperDihedralBond periodicTorsionBond;
+    if (usePeriodic)
+        periodicTorsionBond.setPeriodic(extractBoxVectors(context));
    bondForce.calculateForce(posData, torsionParamArray, forceData, includeEnergy ? &energy : NULL, periodicTorsionBond);
    return energy;
 }
@@ -407,6 +463,7 @@ void CpuCalcRBTorsionForceKernel::initialize(const System& system, const RBTorsi
        torsionParamArray[i][5] = (RealOpenMM) c5;
    }
    bondForce.initialize(system.getNumParticles(), numTorsions, 4, torsionIndexArray, data.threads);
+    usePeriodic = force.usesPeriodicBoundaryConditions();
 }

 double CpuCalcRBTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
@@ -414,6 +471,8 @@ double CpuCalcRBTorsionForceKernel::execute(ContextImpl& context, bool includeFo
    vector<RealVec>& forceData = extractForces(context);
    RealOpenMM energy = 0;
    ReferenceRbDihedralBond rbTorsionBond;
+    if (usePeriodic)
+        rbTorsionBond.setPeriodic(extractBoxVectors(context));
    bondForce.calculateForce(posData, torsionParamArray, forceData, includeEnergy ? &energy : NULL, rbTorsionBond);
    return energy;
 }
@@ -464,15 +523,11 @@ CpuNonbondedForce* createCpuNonbondedForceVec4();
 CpuNonbondedForce* createCpuNonbondedForceVec8();

 CpuCalcNonbondedForceKernel::CpuCalcNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) : CalcNonbondedForceKernel(name, platform),
-        data(data), bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false), neighborList(NULL), nonbonded(NULL) {
-    if (isVec8Supported()) {
-        neighborList = new CpuNeighborList(8);
+        data(data), bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false), nonbonded(NULL) {
+    if (isVec8Supported())
        nonbonded = createCpuNonbondedForceVec8();
-    }
-    else {
-        neighborList = new CpuNeighborList(4);
+    else
        nonbonded = createCpuNonbondedForceVec4();
-    }
 }

 CpuCalcNonbondedForceKernel::~CpuCalcNonbondedForceKernel() {
@@ -486,8 +541,6 @@ CpuCalcNonbondedForceKernel::~CpuCalcNonbondedForceKernel() {
    }
    if (nonbonded != NULL)
        delete nonbonded;
-    if (neighborList != NULL)
-        delete neighborList;
 }

 void CpuCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
@@ -547,6 +600,7 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
    if (nonbondedMethod == NoCutoff)
        useSwitchingFunction = false;
    else {
+        data.requestNeighborList(nonbondedCutoff, 0.25*nonbondedCutoff, true, exclusions);
        useSwitchingFunction = force.getUseSwitchingFunction();
        switchingDistance = force.getSwitchingDistance();
    }
@@ -569,7 +623,6 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
    else
        dispersionCoefficient = 0.0;
-    lastPositions.resize(numParticles, Vec3(1e10, 1e10, 1e10));
    data.isPeriodic = (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME);
 }

@@ -596,53 +649,8 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
    double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
    bool ewald  = (nonbondedMethod == Ewald);
    bool pme  = (nonbondedMethod == PME);
-    if (nonbondedMethod != NoCutoff) {
-        // Determine whether we need to recompute the neighbor list.
-        
-        double padding = 0.25*nonbondedCutoff;
-        bool needRecompute = false;
-        double closeCutoff2 = 0.25*padding*padding;
-        double farCutoff2 = 0.5*padding*padding;
-        int maxNumMoved = numParticles/10;
-        vector<int> moved;
-        for (int i = 0; i < numParticles; i++) {
-            RealVec delta = posData[i]-lastPositions[i];
-            double dist2 = delta.dot(delta);
-            if (dist2 > closeCutoff2) {
-                moved.push_back(i);
-                if (dist2 > farCutoff2 || moved.size() > maxNumMoved) {
-                    needRecompute = true;
-                    break;
-                }
-            }
-        }
-        if (!needRecompute && moved.size() > 0) {
-            // Some particles have moved further than half the padding distance.  Look for pairs
-            // that are missing from the neighbor list.
-
-            int numMoved = moved.size();
-            double cutoff2 = nonbondedCutoff*nonbondedCutoff;
-            double paddedCutoff2 = (nonbondedCutoff+padding)*(nonbondedCutoff+padding);
-            for (int i = 1; i < numMoved && !needRecompute; i++)
-                for (int j = 0; j < i; j++) {
-                    RealVec delta = posData[moved[i]]-posData[moved[j]];
-                    if (delta.dot(delta) < cutoff2) {
-                        // These particles should interact.  See if they are in the neighbor list.
-                        
-                        RealVec oldDelta = lastPositions[moved[i]]-lastPositions[moved[j]];
-                        if (oldDelta.dot(oldDelta) > paddedCutoff2) {
-                            needRecompute = true;
-                            break;
-                        }
-                    }
-                }
-        }
-        if (needRecompute) {
-            neighborList->computeNeighborList(numParticles, posq, exclusions, boxVectors, data.isPeriodic, nonbondedCutoff+padding, data.threads);
-            lastPositions = posData;
-        }
-        nonbonded->setUseCutoff(nonbondedCutoff, *neighborList, rfDielectric);
-    }
+    if (nonbondedMethod != NoCutoff)
+        nonbonded->setUseCutoff(nonbondedCutoff, *data.neighborList, rfDielectric);
    if (data.isPeriodic) {
        RealVec* boxVectors = extractBoxVectors(context);
        double minAllowedSize = 1.999999*nonbondedCutoff;
@@ -739,7 +747,7 @@ void CpuCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int&
 }

 CpuCalcCustomNonbondedForceKernel::CpuCalcCustomNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) :
-            CalcCustomNonbondedForceKernel(name, platform), data(data), forceCopy(NULL), neighborList(NULL), nonbonded(NULL) {
+            CalcCustomNonbondedForceKernel(name, platform), data(data), forceCopy(NULL), nonbonded(NULL) {
 }

 CpuCalcCustomNonbondedForceKernel::~CpuCalcCustomNonbondedForceKernel() {
@@ -748,8 +756,6 @@ CpuCalcCustomNonbondedForceKernel::~CpuCalcCustomNonbondedForceKernel() {
            delete[] particleParamArray[i];
        delete[] particleParamArray;
    }
-    if (neighborList != NULL)
-        delete neighborList;
    if (nonbonded != NULL)
        delete nonbonded;
    if (forceCopy != NULL)
@@ -786,7 +792,7 @@ void CpuCalcCustomNonbondedForceKernel::initialize(const System& system, const C
    if (nonbondedMethod == NoCutoff)
        useSwitchingFunction = false;
    else {
-        neighborList = new CpuNeighborList(4);
+        data.requestNeighborList(nonbondedCutoff, 0.25*nonbondedCutoff, true, exclusions);
        useSwitchingFunction = force.getUseSwitchingFunction();
        switchingDistance = force.getSwitchingDistance();
    }
@@ -852,10 +858,8 @@ double CpuCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool inc
    RealVec* boxVectors = extractBoxVectors(context);
    double energy = 0;
    bool periodic = (nonbondedMethod == CutoffPeriodic);
-    if (nonbondedMethod != NoCutoff) {
-        neighborList->computeNeighborList(numParticles, data.posq, exclusions, boxVectors, data.isPeriodic, nonbondedCutoff, data.threads);
-        nonbonded->setUseCutoff(nonbondedCutoff, *neighborList);
-    }
+    if (nonbondedMethod != NoCutoff)
+        nonbonded->setUseCutoff(nonbondedCutoff, *data.neighborList);
    if (periodic) {
        double minAllowedSize = 2*nonbondedCutoff;
        if (boxVectors[0][0] < minAllowedSize || boxVectors[1][1] < minAllowedSize || boxVectors[2][2] < minAllowedSize)
@@ -963,8 +967,6 @@ CpuCalcCustomGBForceKernel::~CpuCalcCustomGBForceKernel() {
            delete[] particleParamArray[i];
        delete[] particleParamArray;
    }
-    if (neighborList != NULL)
-        delete neighborList;
    if (ixn != NULL)
        delete ixn;
 }
@@ -1012,10 +1014,8 @@ void CpuCalcCustomGBForceKernel::initialize(const System& system, const CustomGB
        globalParameterNames.push_back(force.getGlobalParameterName(i));
    nonbondedMethod = CalcCustomGBForceKernel::NonbondedMethod(force.getNonbondedMethod());
    nonbondedCutoff = (RealOpenMM) force.getCutoffDistance();
-    if (nonbondedMethod == NoCutoff)
-        neighborList = NULL;
-    else
-        neighborList = new CpuNeighborList(4);
+    if (nonbondedMethod != NoCutoff)
+        data.requestNeighborList(nonbondedCutoff, 0.25*nonbondedCutoff, true, exclusions);

    // Create custom functions for the tabulated functions.

@@ -1112,8 +1112,7 @@ double CpuCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFor
        ixn->setPeriodic(extractBoxSize(context));
    if (nonbondedMethod != NoCutoff) {
        vector<set<int> > noExclusions(numParticles);
-        neighborList->computeNeighborList(numParticles, data.posq, exclusions, boxVectors, data.isPeriodic, nonbondedCutoff, data.threads);
-        ixn->setUseCutoff(nonbondedCutoff, *neighborList);
+        ixn->setUseCutoff(nonbondedCutoff, *data.neighborList);
    }
    map<string, double> globalParameters;
    for (int i = 0; i < (int) globalParameterNames.size(); i++)

--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
@@ -225,38 +225,78 @@ public:
                if (usePeriodic)
                    voxelIndex.y = (y < 0 ? y+ny : (y >= ny ? y-ny : y));
                float boxy = floor((float) y/ny);
-                float xoffset = (float) (usePeriodic ? boxy*periodicBoxVectors[1][0]+boxz*periodicBoxVectors[2][0] : 0);
                
                // Identify the range of atoms within this bin we need to search.  When using periodic boundary
                // conditions, there may be two separate ranges.
                
                float minx = centerPos[0];
                float maxx = centerPos[0];
-                fvec4 offset(-xoffset, -yoffset+voxelSizeY*y+(usePeriodic ? 0.0f : miny), voxelSizeZ*z+(usePeriodic ? 0.0f : minz), 0);
-                for (int k = 0; k < (int) blockAtoms.size(); k++) {
-                    const float* atomPos = &sortedPositions[4*(blockSize*blockIndex+k)];
-                    fvec4 posVec(atomPos);
-                    fvec4 delta1 = offset-posVec;
-                    fvec4 delta2 = delta1+fvec4(0, voxelSizeY, voxelSizeZ, 0);
-                    if (usePeriodic) {
-                        delta1 -= round(delta1*invBoxSize)*boxSize;
-                        delta2 -= round(delta2*invBoxSize)*boxSize;
+                if (usePeriodic && triclinic) {
+                    for (int k = 0; k < (int) blockAtoms.size(); k++) {
+                        const float* atomPos = &sortedPositions[4*(blockSize*blockIndex+k)];
+                        fvec4 delta1(0, voxelSizeY*voxelIndex.y-atomPos[1], voxelSizeZ*voxelIndex.z-atomPos[2], 0);
+                        fvec4 delta2 = delta1+fvec4(0, voxelSizeY, 0, 0);
+                        fvec4 delta3 = delta1+fvec4(0, 0, voxelSizeZ, 0);
+                        fvec4 delta4 = delta1+fvec4(0, voxelSizeY, voxelSizeZ, 0);
+                        delta1 -= periodicBoxVec4[2]*floorf(delta1[2]*recipBoxSize[2]+0.5f);
+                        delta1 -= periodicBoxVec4[1]*floorf(delta1[1]*recipBoxSize[1]+0.5f);
+                        delta1 -= periodicBoxVec4[0]*floorf(delta1[0]*recipBoxSize[0]+0.5f);
+                        delta2 -= periodicBoxVec4[2]*floorf(delta2[2]*recipBoxSize[2]+0.5f);
+                        delta2 -= periodicBoxVec4[1]*floorf(delta2[1]*recipBoxSize[1]+0.5f);
+                        delta2 -= periodicBoxVec4[0]*floorf(delta2[0]*recipBoxSize[0]+0.5f);
+                        delta3 -= periodicBoxVec4[2]*floorf(delta3[2]*recipBoxSize[2]+0.5f);
+                        delta3 -= periodicBoxVec4[1]*floorf(delta3[1]*recipBoxSize[1]+0.5f);
+                        delta3 -= periodicBoxVec4[0]*floorf(delta3[0]*recipBoxSize[0]+0.5f);
+                        delta4 -= periodicBoxVec4[2]*floorf(delta4[2]*recipBoxSize[2]+0.5f);
+                        delta4 -= periodicBoxVec4[1]*floorf(delta4[1]*recipBoxSize[1]+0.5f);
+                        delta4 -= periodicBoxVec4[0]*floorf(delta4[0]*recipBoxSize[0]+0.5f);
+                        if (delta1[1] < 0 && delta1[1]+voxelSizeY > 0)
+                            delta1 = fvec4(delta1[0], 0, delta1[2], 0);
+                        if (delta1[2] < 0 && delta1[2]+voxelSizeZ > 0)
+                            delta1 = fvec4(delta1[0], delta1[1], 0, 0);
+                        if (delta3[1] < 0 && delta3[1]+voxelSizeY > 0)
+                            delta3 = fvec4(delta3[0], 0, delta3[2], 0);
+                        if (delta2[2] < 0 && delta2[2]+voxelSizeZ > 0)
+                            delta2 = fvec4(delta2[0], delta2[1], 0, 0);
+                        fvec4 delta = min(min(min(abs(delta1), abs(delta2)), abs(delta3)), abs(delta4));
+                        float dy = (voxelIndex.y == atomVoxelIndex[k].y ? 0.0f : delta[1]);
+                        float dz = (voxelIndex.z == atomVoxelIndex[k].z ? 0.0f : delta[2]);
+                        float dist2 = maxDistanceSquared-dy*dy-dz*dz;
+                        if (dist2 > 0) {
+                            float dist = sqrtf(dist2);
+                            minx = min(minx, atomPos[0]-dist-max(max(max(delta1[0], delta2[0]), delta3[0]), delta4[0]));
+                            maxx = max(maxx, atomPos[0]+dist-min(min(min(delta1[0], delta2[0]), delta3[0]), delta4[0]));
+                        }
                    }
-                    fvec4 delta = min(abs(delta1), abs(delta2));
-                    float dy = (y == atomVoxelIndex[k].y ? 0.0f : delta[1]);
-                    float dz = (z == atomVoxelIndex[k].z ? 0.0f : delta[2]);
-                    float dist2 = maxDistanceSquared-dy*dy-dz*dz;
-                    if (dist2 > 0) {
-                        float dist = sqrtf(dist2);
-                        minx = min(minx, atomPos[0]-dist-xoffset);
-                        maxx = max(maxx, atomPos[0]+dist-xoffset);
+                }
+                else {
+                    float xoffset = (float) (usePeriodic ? boxy*periodicBoxVectors[1][0]+boxz*periodicBoxVectors[2][0] : 0);
+                    fvec4 offset(-xoffset, -yoffset+voxelSizeY*y+(usePeriodic ? 0.0f : miny), voxelSizeZ*z+(usePeriodic ? 0.0f : minz), 0);
+                    for (int k = 0; k < (int) blockAtoms.size(); k++) {
+                        const float* atomPos = &sortedPositions[4*(blockSize*blockIndex+k)];
+                        fvec4 posVec(atomPos);
+                        fvec4 delta1 = offset-posVec;
+                        fvec4 delta2 = delta1+fvec4(0, voxelSizeY, voxelSizeZ, 0);
+                        if (usePeriodic) {
+                            delta1 -= round(delta1*invBoxSize)*boxSize;
+                            delta2 -= round(delta2*invBoxSize)*boxSize;
+                        }
+                        fvec4 delta = min(abs(delta1), abs(delta2));
+                        float dy = (y == atomVoxelIndex[k].y ? 0.0f : delta[1]);
+                        float dz = (z == atomVoxelIndex[k].z ? 0.0f : delta[2]);
+                        float dist2 = maxDistanceSquared-dy*dy-dz*dz;
+                        if (dist2 > 0) {
+                            float dist = sqrtf(dist2);
+                            minx = min(minx, atomPos[0]-dist-xoffset);
+                            maxx = max(maxx, atomPos[0]+dist-xoffset);
+                        }
                    }
                }
                if (minx == maxx)
                    continue;
-                bool needPeriodic = (centerPos[1]-blockWidth[1] < maxDistance || centerPos[1]+blockWidth[1] > periodicBoxSize[1]-maxDistance ||
-                                     centerPos[2]-blockWidth[2] < maxDistance || centerPos[2]+blockWidth[2] > periodicBoxSize[2]-maxDistance ||
-                                     minx < 0.0f || maxx > periodicBoxVectors[0][0]);
+                bool needPeriodic = usePeriodic && (centerPos[1]-blockWidth[1] < maxDistance || centerPos[1]+blockWidth[1] > periodicBoxSize[1]-maxDistance ||
+                                                    centerPos[2]-blockWidth[2] < maxDistance || centerPos[2]+blockWidth[2] > periodicBoxSize[2]-maxDistance ||
+                                                    minx < 0.0f || maxx > periodicBoxVectors[0][0]);
                int numRanges;
                int rangeStart[2];
                int rangeEnd[2];
@@ -294,7 +334,7 @@ public:
                            continue;
                        
                        fvec4 atomPos(&sortedPositions[4*sortedIndex]);
-                        fvec4 delta = atomPos-centerPos;
+                        fvec4 delta = atomPos-blockCenter;
                        if (periodicRectangular)
                            delta -= round(delta*invBoxSize)*boxSize;
                        else if (needPeriodic) {
@@ -468,6 +508,10 @@ int CpuNeighborList::getNumBlocks() const {
    return sortedAtoms.size()/blockSize;
 }

+int CpuNeighborList::getBlockSize() const {
+    return blockSize;
+}
+
 const std::vector<int>& CpuNeighborList::getSortedAtoms() const {
    return sortedAtoms;
 }

--- a/platforms/cpu/src/CpuPlatform.cpp
+++ b/platforms/cpu/src/CpuPlatform.cpp
@@ -34,6 +34,7 @@
 #include "CpuKernels.h"
 #include "CpuSETTLE.h"
 #include "ReferenceConstraints.h"
+#include "openmm/OpenMMException.h"
 #include "openmm/internal/hardware.h"
 #include "openmm/internal/vectorize.h"
 #include <sstream>
@@ -59,6 +60,7 @@ extern "C" OPENMM_EXPORT_CPU void registerPlatforms() {
 map<const ContextImpl*, CpuPlatform::PlatformData*> CpuPlatform::contextData;

 CpuPlatform::CpuPlatform() {
+    deprecatedPropertyReplacements["CpuThreads"] = CpuThreads();
    CpuKernelFactory* factory = new CpuKernelFactory();
    registerKernelFactory(CalcForcesAndEnergyKernel::Name(), factory);
    registerKernelFactory(CalcHarmonicAngleForceKernel::Name(), factory);
@@ -84,7 +86,10 @@ CpuPlatform::CpuPlatform() {
 const string& CpuPlatform::getPropertyValue(const Context& context, const string& property) const {
    const ContextImpl& impl = getContextImpl(context);
    const PlatformData& data = getPlatformData(impl);
-    map<string, string>::const_iterator value = data.propertyValues.find(property);
+    string propertyName = property;
+    if (deprecatedPropertyReplacements.find(property) != deprecatedPropertyReplacements.end())
+        propertyName = deprecatedPropertyReplacements.find(property)->second;
+    map<string, string>::const_iterator value = data.propertyValues.find(propertyName);
    if (value != data.propertyValues.end())
        return value->second;
    return ReferencePlatform::getPropertyValue(context, property);
@@ -132,7 +137,8 @@ const CpuPlatform::PlatformData& CpuPlatform::getPlatformData(const ContextImpl&
    return *contextData[&context];
 }

-CpuPlatform::PlatformData::PlatformData(int numParticles, int numThreads) : posq(4*numParticles), threads(numThreads) {
+CpuPlatform::PlatformData::PlatformData(int numParticles, int numThreads) : posq(4*numParticles), threads(numThreads),
+        neighborList(NULL), cutoff(0.0), paddedCutoff(0.0), anyExclusions(false) {
    numThreads = threads.getNumThreads();
    threadForce.resize(numThreads);
    for (int i = 0; i < numThreads; i++)
@@ -142,3 +148,27 @@ CpuPlatform::PlatformData::PlatformData(int numParticles, int numThreads) : posq
    threadsProperty << numThreads;
    propertyValues[CpuThreads()] = threadsProperty.str();
 }
+
+CpuPlatform::PlatformData::~PlatformData() {
+    if (neighborList != NULL)
+        delete neighborList;
+}
+
+bool isVec8Supported();
+
+void CpuPlatform::PlatformData::requestNeighborList(double cutoffDistance, double padding, bool useExclusions, vector<set<int> >& exclusionList) {
+    if (neighborList == NULL)
+        neighborList = new CpuNeighborList(isVec8Supported() ? 8 : 4);
+    if (cutoffDistance > cutoff)
+        cutoff = cutoffDistance;
+    if (cutoffDistance+padding > paddedCutoff)
+        paddedCutoff = cutoffDistance+padding;
+    if (useExclusions) {
+        if (anyExclusions && exclusions != exclusionList)
+            throw OpenMMException("All Forces must have identical exclusions");
+        else {
+            exclusions = exclusionList;
+            anyExclusions = true;
+        }
+    }
+}
--- a/platforms/cpu/tests/TestCpuNeighborList.cpp
+++ b/platforms/cpu/tests/TestCpuNeighborList.cpp
@@ -53,14 +53,14 @@ void testNeighborList(bool periodic, bool triclinic) {
    const float cutoff = 2.0f;
    RealVec boxVectors[3];
    if (triclinic) {
-        boxVectors[0] = RealVec(20, 0, 0);
-        boxVectors[1] = RealVec(5, 15, 0);
-        boxVectors[2] = RealVec(-3, -7, 22);
+        boxVectors[0] = RealVec(10, 0, 0);
+        boxVectors[1] = RealVec(4, 9, 0);
+        boxVectors[2] = RealVec(-3, -3.5, 11);
    }
    else {
-        boxVectors[0] = RealVec(20, 0, 0);
-        boxVectors[1] = RealVec(0, 15, 0);
-        boxVectors[2] = RealVec(0, 0, 22);
+        boxVectors[0] = RealVec(10, 0, 0);
+        boxVectors[1] = RealVec(0, 9, 0);
+        boxVectors[2] = RealVec(0, 0, 11);
    }
    const float boxSize[3] = {(float) boxVectors[0][0], (float) boxVectors[1][1], (float) boxVectors[2][2]};
    const int blockSize = 8;

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -178,7 +178,7 @@ public:
     * @param b      the vector defining the second edge of the periodic box
     * @param c      the vector defining the third edge of the periodic box
     */
-    void setPeriodicBoxVectors(ContextImpl& context, const Vec3& a, const Vec3& b, const Vec3& c) const;
+    void setPeriodicBoxVectors(ContextImpl& context, const Vec3& a, const Vec3& b, const Vec3& c);
    /**
     * Create a checkpoint recording the current state of the Context.
     * 
@@ -718,7 +718,7 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
 private:
-    void initInteractionGroups(const CustomNonbondedForce& force, const std::string& interactionSource);
+    void initInteractionGroups(const CustomNonbondedForce& force, const std::string& interactionSource, const std::vector<std::string>& tableTypes);
    CudaContext& cu;
    CudaParameterSet* params;
    CudaArray* globals;

--- a/platforms/cuda/include/CudaPlatform.h
+++ b/platforms/cuda/include/CudaPlatform.h
@@ -58,35 +58,35 @@ public:
     * This is the name of the parameter for selecting which CUDA device or devices to use.
     */
    static const std::string& CudaDeviceIndex() {
-        static const std::string key = "CudaDeviceIndex";
+        static const std::string key = "DeviceIndex";
        return key;
    }
    /**
     * This is the name of the parameter that reports the CUDA device or devices being used.
     */
    static const std::string& CudaDeviceName() {
-        static const std::string key = "CudaDeviceName";
+        static const std::string key = "DeviceName";
        return key;
    }
    /**
     * This is the name of the parameter for selecting whether CUDA should sync or spin loop while waiting for results.
     */
    static const std::string& CudaUseBlockingSync() {
-        static const std::string key = "CudaUseBlockingSync";
+        static const std::string key = "UseBlockingSync";
        return key;
    }
    /**
     * This is the name of the parameter for selecting what numerical precision to use.
     */
    static const std::string& CudaPrecision() {
-        static const std::string key = "CudaPrecision";
+        static const std::string key = "Precision";
        return key;
    }
    /**
     * This is the name of the parameter for selecting whether to use the CPU based PME calculation.
     */
    static const std::string& CudaUseCpuPme() {
-        static const std::string key = "CudaUseCpuPme";
+        static const std::string key = "UseCpuPme";
        return key;
    }
    /**
@@ -107,14 +107,21 @@ public:
     * This is the name of the parameter for specifying the path to the directory for creating temporary files.
     */
    static const std::string& CudaTempDirectory() {
-        static const std::string key = "CudaTempDirectory";
+        static const std::string key = "TempDirectory";
        return key;
    }
    /**
     * This is the name of the parameter for selecting whether to disable use of a separate stream for PME.
     */
    static const std::string& CudaDisablePmeStream() {
-        static const std::string key = "CudaDisablePmeStream";
+        static const std::string key = "DisablePmeStream";
+        return key;
+    }
+    /**
+     * This is the name of the parameter for requesting that force computations be fully deterministic.
+     */
+    static const std::string& CudaDeterministicForces() {
+        static const std::string key = "DeterministicForces";
        return key;
    }
 };
@@ -123,14 +130,14 @@ class OPENMM_EXPORT_CUDA CudaPlatform::PlatformData {
 public:
    PlatformData(ContextImpl* context, const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
            const std::string& cpuPmeProperty, const std::string& compilerProperty, const std::string& tempProperty, const std::string& hostCompilerProperty,
-            const std::string& pmeStreamProperty, int numThreads);
+            const std::string& pmeStreamProperty, const std::string& deterministicForcesProperty, int numThreads);
    ~PlatformData();
    void initializeContexts(const System& system);
    void syncContexts();
    ContextImpl* context;
    std::vector<CudaContext*> contexts;
    std::vector<double> contextEnergy;
-    bool hasInitializedContexts, removeCM, peerAccessSupported, useCpuPme, disablePmeStream;
+    bool hasInitializedContexts, removeCM, peerAccessSupported, useCpuPme, disablePmeStream, deterministicForces;
    int cmMotionFrequency;
    int stepCount, computeForceCount;
    double time;

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2016 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -106,7 +106,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        useMixedPrecision = false;
    }
    else
-        throw OpenMMException("Illegal value for CudaPrecision: "+precision);
+        throw OpenMMException("Illegal value for Precision: "+precision);
    char* cacheVariable = getenv("OPENMM_CACHE_DIR");
    cacheDir = (cacheVariable == NULL ? tempDir : string(cacheVariable));
 #ifdef WIN32
@@ -121,7 +121,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    string errorMessage = "Error initializing Context";
    CHECK_RESULT(cuDeviceGetCount(&numDevices));
    if (deviceIndex < -1 || deviceIndex >= numDevices)
-        throw OpenMMException("Illegal value for CudaDeviceIndex: "+intToString(deviceIndex));
+        throw OpenMMException("Illegal value for DeviceIndex: "+intToString(deviceIndex));

    vector<int> devicePrecedence;
    if (deviceIndex == -1) {

--- a/platforms/cuda/src/CudaExpressionUtilities.cpp
+++ b/platforms/cuda/src/CudaExpressionUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2016 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -254,7 +254,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                            for (int k = 3; k >= 0; k--)
                                for (int m = 0; m < 4; m++) {
                                    int base = 4*m;
-                                    string suffix = suffixes[m];
+                                    string suffix = suffixes[k];
                                    out << "derivy[" << m << "] = da*derivy[" << m << "] + (3*c[" << (base+3) << "]" << suffix << "*db + 2*c[" << (base+2) << "]" << suffix << ")*db + c[" << (base+1) << "]" << suffix << ";\n";
                                }
                            out << nodeNames[j] << " = derivy[0] + dc*(derivy[1] + dc*(derivy[2] + dc*derivy[3]));\n";
@@ -271,7 +271,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                            out << nodeNames[j] << " *= " << paramsFloat[11] << ";\n";
                        }
                        else
-                            throw OpenMMException("Unsupported derivative order for Continuous2DFunction");
+                            throw OpenMMException("Unsupported derivative order for Continuous3DFunction");
                    }
                    out << "}\n";
                }

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -62,6 +62,14 @@ extern "C" OPENMM_EXPORT_CUDA void registerPlatforms() {
 #endif

 CudaPlatform::CudaPlatform() {
+    deprecatedPropertyReplacements["CudaDeviceIndex"] = CudaDeviceIndex();
+    deprecatedPropertyReplacements["CudaDeviceName"] = CudaDeviceName();
+    deprecatedPropertyReplacements["CudaUseBlockingSync"] = CudaUseBlockingSync();
+    deprecatedPropertyReplacements["CudaPrecision"] = CudaPrecision();
+    deprecatedPropertyReplacements["CudaUseCpuPme"] = CudaUseCpuPme();
+    deprecatedPropertyReplacements["CudaTempDirectory"] = CudaTempDirectory();
+    deprecatedPropertyReplacements["CudaDisablePmeStream"] = CudaDisablePmeStream();
+    deprecatedPropertyReplacements["CudaDeterministicForces"] = CudaDeterministicForces();
    CudaKernelFactory* factory = new CudaKernelFactory();
    registerKernelFactory(CalcForcesAndEnergyKernel::Name(), factory);
    registerKernelFactory(UpdateStateDataKernel::Name(), factory);
@@ -102,12 +110,14 @@ CudaPlatform::CudaPlatform() {
    platformProperties.push_back(CudaTempDirectory());
    platformProperties.push_back(CudaHostCompiler());
    platformProperties.push_back(CudaDisablePmeStream());
+    platformProperties.push_back(CudaDeterministicForces());
    setPropertyDefaultValue(CudaDeviceIndex(), "");
    setPropertyDefaultValue(CudaDeviceName(), "");
    setPropertyDefaultValue(CudaUseBlockingSync(), "true");
    setPropertyDefaultValue(CudaPrecision(), "single");
    setPropertyDefaultValue(CudaUseCpuPme(), "false");
    setPropertyDefaultValue(CudaDisablePmeStream(), "false");
+    setPropertyDefaultValue(CudaDeterministicForces(), "false");
 #ifdef _MSC_VER
    char* bindir = getenv("CUDA_BIN_PATH");
    string nvcc = (bindir == NULL ? "nvcc.exe" : string(bindir)+"\\nvcc.exe");
@@ -142,7 +152,10 @@ bool CudaPlatform::supportsDoublePrecision() const {
 const string& CudaPlatform::getPropertyValue(const Context& context, const string& property) const {
    const ContextImpl& impl = getContextImpl(context);
    const PlatformData* data = reinterpret_cast<const PlatformData*>(impl.getPlatformData());
-    map<string, string>::const_iterator value = data->propertyValues.find(property);
+    string propertyName = property;
+    if (deprecatedPropertyReplacements.find(property) != deprecatedPropertyReplacements.end())
+        propertyName = deprecatedPropertyReplacements.find(property)->second;
+    map<string, string>::const_iterator value = data->propertyValues.find(propertyName);
    if (value != data->propertyValues.end())
        return value->second;
    return Platform::getPropertyValue(context, property);
@@ -168,10 +181,13 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
            getPropertyDefaultValue(CudaHostCompiler()) : properties.find(CudaHostCompiler())->second);
    string pmeStreamPropValue = (properties.find(CudaDisablePmeStream()) == properties.end() ?
            getPropertyDefaultValue(CudaDisablePmeStream()) : properties.find(CudaDisablePmeStream())->second);
+    string deterministicForcesValue = (properties.find(CudaDeterministicForces()) == properties.end() ?
+            getPropertyDefaultValue(CudaDeterministicForces()) : properties.find(CudaDeterministicForces())->second);
    transform(blockingPropValue.begin(), blockingPropValue.end(), blockingPropValue.begin(), ::tolower);
    transform(precisionPropValue.begin(), precisionPropValue.end(), precisionPropValue.begin(), ::tolower);
    transform(cpuPmePropValue.begin(), cpuPmePropValue.end(), cpuPmePropValue.begin(), ::tolower);
    transform(pmeStreamPropValue.begin(), pmeStreamPropValue.end(), pmeStreamPropValue.begin(), ::tolower);
+    transform(deterministicForcesValue.begin(), deterministicForcesValue.end(), deterministicForcesValue.begin(), ::tolower);
    vector<string> pmeKernelName;
    pmeKernelName.push_back(CalcPmeReciprocalForceKernel::Name());
    if (!supportsKernels(pmeKernelName))
@@ -180,7 +196,8 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
    char* threadsEnv = getenv("OPENMM_CPU_THREADS");
    if (threadsEnv != NULL)
        stringstream(threadsEnv) >> threads;
-    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue, hostCompilerPropValue, pmeStreamPropValue, threads));
+    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue,
+            hostCompilerPropValue, pmeStreamPropValue, deterministicForcesValue, threads));
 }

 void CudaPlatform::contextDestroyed(ContextImpl& context) const {
@@ -189,7 +206,8 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
 }

 CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
-            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty, const string& pmeStreamProperty, int numThreads) :
+            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty, const string& pmeStreamProperty,
+            const string& deterministicForcesProperty, int numThreads) :
                context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false), threads(numThreads) {
    bool blocking = (blockingProperty == "true");
    vector<string> devices;
@@ -230,6 +248,7 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
    }
    useCpuPme = (cpuPmeProperty == "true" && !contexts[0]->getUseDoublePrecision());
    disablePmeStream = (pmeStreamProperty == "true");
+    deterministicForces = (deterministicForcesProperty == "true");
    propertyValues[CudaPlatform::CudaDeviceIndex()] = deviceIndex.str();
    propertyValues[CudaPlatform::CudaDeviceName()] = deviceName.str();
    propertyValues[CudaPlatform::CudaUseBlockingSync()] = blocking ? "true" : "false";
@@ -239,6 +258,7 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
    propertyValues[CudaPlatform::CudaTempDirectory()] = tempProperty;
    propertyValues[CudaPlatform::CudaHostCompiler()] = hostCompilerProperty;
    propertyValues[CudaPlatform::CudaDisablePmeStream()] = disablePmeStream ? "true" : "false";
+    propertyValues[CudaPlatform::CudaDeterministicForces()] = deterministicForces ? "true" : "false";
    contextEnergy.resize(contexts.size());
    
    // Determine whether peer-to-peer copying is supported, and enable it if so.

--- a/platforms/cuda/src/kernels/angleForce.cu
+++ b/platforms/cuda/src/kernels/angleForce.cu
 real3 v0 = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
 real3 v1 = make_real3(pos2.x-pos3.x, pos2.y-pos3.y, pos2.z-pos3.z);
+#if APPLY_PERIODIC
+APPLY_PERIODIC_TO_DELTA(v0)
+APPLY_PERIODIC_TO_DELTA(v1)
+#endif
 real3 cp = cross(v0, v1);
 real rp = cp.x*cp.x + cp.y*cp.y + cp.z*cp.z;
 rp = max(SQRT(rp), (real) 1.0e-06f);

--- a/platforms/cuda/src/kernels/bondForce.cu
+++ b/platforms/cuda/src/kernels/bondForce.cu
 real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
+#if APPLY_PERIODIC
+APPLY_PERIODIC_TO_DELTA(delta)
+#endif
 real r = SQRT(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z);
 COMPUTE_FORCE
 dEdR = (r > 0) ? (dEdR / r) : 0;

--- a/platforms/cuda/src/kernels/cmapTorsionForce.cu
+++ b/platforms/cuda/src/kernels/cmapTorsionForce.cu
@@ -5,6 +5,11 @@ const real PI = (real) 3.14159265358979323846;
 real3 v0a = make_real3(pos1.x-pos2.x, pos1.y-pos2.y, pos1.z-pos2.z);
 real3 v1a = make_real3(pos3.x-pos2.x, pos3.y-pos2.y, pos3.z-pos2.z);
 real3 v2a = make_real3(pos3.x-pos4.x, pos3.y-pos4.y, pos3.z-pos4.z);
+#if APPLY_PERIODIC
+APPLY_PERIODIC_TO_DELTA(v0a)
+APPLY_PERIODIC_TO_DELTA(v1a)
+APPLY_PERIODIC_TO_DELTA(v2a)
+#endif
 real3 cp0a = cross(v0a, v1a);
 real3 cp1a = cross(v1a, v2a);
 real cosangle = dot(normalize(cp0a), normalize(cp1a));
@@ -28,6 +33,11 @@ angleA = fmod(angleA+2.0f*PI, 2.0f*PI);
 real3 v0b = make_real3(pos5.x-pos6.x, pos5.y-pos6.y, pos5.z-pos6.z);
 real3 v1b = make_real3(pos7.x-pos6.x, pos7.y-pos6.y, pos7.z-pos6.z);
 real3 v2b = make_real3(pos7.x-pos8.x, pos7.y-pos8.y, pos7.z-pos8.z);
+#if APPLY_PERIODIC
+APPLY_PERIODIC_TO_DELTA(v0b)
+APPLY_PERIODIC_TO_DELTA(v1b)
+APPLY_PERIODIC_TO_DELTA(v2b)
+#endif
 real3 cp0b = cross(v0b, v1b);
 real3 cp1b = cross(v1b, v2b);
 cosangle = dot(normalize(cp0b), normalize(cp1b));

--- a/platforms/cuda/src/kernels/customCentroidBond.cu
+++ b/platforms/cuda/src/kernels/customCentroidBond.cu
@@ -66,8 +66,11 @@ inline __device__ real3 trim(real4 v) {
 /**
 * Compute the difference between two vectors, setting the fourth component to the squared magnitude.
 */
-inline __device__ real4 delta(real4 vec1, real4 vec2) {
+inline __device__ real4 delta(real4 vec1, real4 vec2, bool periodic, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
+    if (periodic)
+        APPLY_PERIODIC_TO_DELTA(result);
    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
    return result;
 }
@@ -105,7 +108,7 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
 * Compute the forces on groups based on the bonds.
 */
 extern "C" __global__ void computeGroupForces(unsigned long long* __restrict__ groupForce, mixed* __restrict__ energyBuffer, const real4* __restrict__ centerPositions,
-        const int* __restrict__ bondGroups
+        const int* __restrict__ bondGroups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        EXTRA_ARGS) {
    mixed energy = 0;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_BONDS; index += blockDim.x*gridDim.x) {

--- a/platforms/cuda/src/kernels/customCompoundBond.cu
+++ b/platforms/cuda/src/kernels/customCompoundBond.cu
@@ -8,8 +8,11 @@ inline __device__ real3 ccb_trim(real4 v) {
 /**
 * Compute the difference between two vectors, setting the fourth component to the squared magnitude.
 */
-inline __device__ real4 ccb_delta(real4 vec1, real4 vec2) {
+inline __device__ real4 ccb_delta(real4 vec1, real4 vec2, bool periodic, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
+    if (periodic)
+        APPLY_PERIODIC_TO_DELTA(result);
    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
    return result;
 }

--- a/platforms/cuda/src/kernels/customManyParticle.cu
+++ b/platforms/cuda/src/kernels/customManyParticle.cu
@@ -59,7 +59,7 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
 /**
 * Determine whether a particular interaction is in the list of exclusions.
 */
-inline __device__ bool isInteractionExcluded(int atom1, int atom2, int* __restrict__ exclusions, int* __restrict__ exclusionStartIndex) {
+inline __device__ bool isInteractionExcluded(int atom1, int atom2, const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex) {
    int first = exclusionStartIndex[atom1];
    int last = exclusionStartIndex[atom1+1];
    for (int i = last-1; i >= first; i--) {
@@ -180,7 +180,7 @@ extern "C" __global__ void findNeighbors(real4 periodicBoxSize, real4 invPeriodi
        const real4* __restrict__ posq, const real4* __restrict__ blockCenter, const real4* __restrict__ blockBoundingBox, int2* __restrict__ neighborPairs,
        int* __restrict__ numNeighborPairs, int* __restrict__ numNeighborsForAtom, int maxNeighborPairs
 #ifdef USE_EXCLUSIONS
-        , int* __restrict__ exclusions, int* __restrict__ exclusionStartIndex
+        , const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex
 #endif
        ) {
    __shared__ real3 positionCache[FIND_NEIGHBORS_WORKGROUP_SIZE];
@@ -265,7 +265,8 @@ extern "C" __global__ void findNeighbors(real4 periodicBoxSize, real4 invPeriodi
                }
            }
        }
-        numNeighborsForAtom[atom1] = totalNeighborsForAtom1;
+        if (atom1 < NUM_ATOMS)
+            numNeighborsForAtom[atom1] = totalNeighborsForAtom1;
    }
 }

@@ -308,6 +309,7 @@ extern "C" __global__ void computeNeighborStartIndices(int* __restrict__ numNeig
            numNeighborsForAtom[globalIndex] = 0; // Clear this so the next kernel can use it as a counter
        }
        globalOffset += posBuffer[blockDim.x-1];
+        __syncthreads();
    }
    if (threadIdx.x == 0)
        neighborStartIndex[0] = 0;

--- a/platforms/cuda/src/kernels/pme.cu
+++ b/platforms/cuda/src/kernels/pme.cu
@@ -83,7 +83,7 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
 #ifdef USE_DOUBLE_PRECISION
                    unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
                    atomicAdd(&ulonglong_p[index],  static_cast<unsigned long long>((long long) (add*0x100000000)));
-#elif __CUDA_ARCH__ < 200
+#elif __CUDA_ARCH__ < 200 || defined(USE_DETERMINISTIC_FORCES)
                    unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
                    int gridIndex = index;
                    gridIndex = (gridIndex%2 == 0 ? gridIndex/2 : (gridIndex+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2);

--- a/platforms/cuda/src/kernels/torsionForce.cu
+++ b/platforms/cuda/src/kernels/torsionForce.cu
@@ -2,6 +2,11 @@ const real PI = (real) 3.14159265358979323846;
 real3 v0 = make_real3(pos1.x-pos2.x, pos1.y-pos2.y, pos1.z-pos2.z);
 real3 v1 = make_real3(pos3.x-pos2.x, pos3.y-pos2.y, pos3.z-pos2.z);
 real3 v2 = make_real3(pos3.x-pos4.x, pos3.y-pos4.y, pos3.z-pos4.z);
+#if APPLY_PERIODIC
+APPLY_PERIODIC_TO_DELTA(v0)
+APPLY_PERIODIC_TO_DELTA(v1)
+APPLY_PERIODIC_TO_DELTA(v2)
+#endif
 real3 cp0 = cross(v0, v1);
 real3 cp1 = cross(v1, v2);
 real cosangle = dot(normalize(cp0), normalize(cp1));

--- a/platforms/cuda/tests/CudaTests.h
+++ b/platforms/cuda/tests/CudaTests.h
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2015 Stanford University and the Authors.           *
+ * Portions copyright (c) 2015-2016 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -39,5 +39,5 @@ OpenMM::CudaPlatform platform;

 void initializeTests(int argc, char* argv[]) {
    if (argc > 1)
-        platform.setPropertyDefaultValue("CudaPrecision", std::string(argv[1]));
+        platform.setPropertyDefaultValue("Precision", std::string(argv[1]));
 }