resolved conflict

73183c61 · ChayaSt · 0e218233 · 32e08b87 · 73183c61 · 73183c61
Commit 73183c61 authored May 31, 2016 by ChayaSt
20 changed files
--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2016 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -106,7 +106,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        useMixedPrecision = false;
    }
    else
-        throw OpenMMException("Illegal value for CudaPrecision: "+precision);
+        throw OpenMMException("Illegal value for Precision: "+precision);
    char* cacheVariable = getenv("OPENMM_CACHE_DIR");
    cacheDir = (cacheVariable == NULL ? tempDir : string(cacheVariable));
 #ifdef WIN32
@@ -121,7 +121,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    string errorMessage = "Error initializing Context";
    CHECK_RESULT(cuDeviceGetCount(&numDevices));
    if (deviceIndex < -1 || deviceIndex >= numDevices)
-        throw OpenMMException("Illegal value for CudaDeviceIndex: "+intToString(deviceIndex));
+        throw OpenMMException("Illegal value for DeviceIndex: "+intToString(deviceIndex));
    vector<int> devicePrecedence;
    if (deviceIndex == -1) {
@@ -1131,7 +1131,6 @@ void CudaContext::reorderAtoms() {
        reorderAtomsImpl<float, float4, double, double4>();
    else
        reorderAtomsImpl<float, float4, float, float4>();
-    nonbonded->updateNeighborListSize();
 }
 template <class Real, class Real4, class Mixed, class Mixed4>

--- a/platforms/cuda/src/CudaExpressionUtilities.cpp
+++ b/platforms/cuda/src/CudaExpressionUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2016 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -174,8 +174,8 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                    out << "if (x >= " << paramsFloat[2] << " && x <= " << paramsFloat[3] << " && y >= " << paramsFloat[4] << " && y <= " << paramsFloat[5] << ") {\n";
                    out << "x = (x - " << paramsFloat[2] << ")*" << paramsFloat[6] << ";\n";
                    out << "y = (y - " << paramsFloat[4] << ")*" << paramsFloat[7] << ";\n";
-                    out << "int s = min((int) floor(x), " << paramsInt[0] << ");\n";
+                    out << "int s = min((int) floor(x), " << paramsInt[0] << "-1);\n";
-                    out << "int t = min((int) floor(y), " << paramsInt[1] << ");\n";
+                    out << "int t = min((int) floor(y), " << paramsInt[1] << "-1);\n";
                    out << "int coeffIndex = 4*(s+" << paramsInt[0] << "*t);\n";
                    out << "float4 c[4];\n";
                    for (int j = 0; j < 4; j++)
@@ -217,9 +217,9 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                    out << "x = (x - " << paramsFloat[3] << ")*" << paramsFloat[9] << ";\n";
                    out << "y = (y - " << paramsFloat[5] << ")*" << paramsFloat[10] << ";\n";
                    out << "z = (z - " << paramsFloat[7] << ")*" << paramsFloat[11] << ";\n";
-                    out << "int s = min((int) floor(x), " << paramsInt[0] << ");\n";
+                    out << "int s = min((int) floor(x), " << paramsInt[0] << "-1);\n";
-                    out << "int t = min((int) floor(y), " << paramsInt[1] << ");\n";
+                    out << "int t = min((int) floor(y), " << paramsInt[1] << "-1);\n";
-                    out << "int u = min((int) floor(z), " << paramsInt[2] << ");\n";
+                    out << "int u = min((int) floor(z), " << paramsInt[2] << "-1);\n";
                    out << "int coeffIndex = 16*(s+" << paramsInt[0] << "*(t+" << paramsInt[1] << "*u));\n";
                    out << "float4 c[16];\n";
                    for (int j = 0; j < 16; j++)
@@ -254,7 +254,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                            for (int k = 3; k >= 0; k--)
                                for (int m = 0; m < 4; m++) {
                                    int base = 4*m;
-                                    string suffix = suffixes[m];
+                                    string suffix = suffixes[k];
                                    out << "derivy[" << m << "] = da*derivy[" << m << "] + (3*c[" << (base+3) << "]" << suffix << "*db + 2*c[" << (base+2) << "]" << suffix << ")*db + c[" << (base+1) << "]" << suffix << ";\n";
                                }
                            out << nodeNames[j] << " = derivy[0] + dc*(derivy[1] + dc*(derivy[2] + dc*derivy[3]));\n";
@@ -271,7 +271,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                            out << nodeNames[j] << " *= " << paramsFloat[11] << ";\n";
                        }
                        else
-                            throw OpenMMException("Unsupported derivative order for Continuous2DFunction");
+                            throw OpenMMException("Unsupported derivative order for Continuous3DFunction");
                    }
                    out << "}\n";
                }

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -94,6 +94,7 @@ void CudaCalcForcesAndEnergyKernel::initialize(const System& system) {
 }
 void CudaCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
+    cu.setForcesValid(true);
    cu.setAsCurrent();
    cu.clearAutoclearBuffers();
    for (vector<CudaContext::ForcePreComputation*>::iterator iter = cu.getPreComputations().begin(); iter != cu.getPreComputations().end(); ++iter)
@@ -125,6 +126,8 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo
                sum += energy[i];
        }
    }
+    if (!cu.getForcesValid())
+        valid = false;
    return sum;
 }
@@ -340,10 +343,27 @@ void CudaUpdateStateDataKernel::getPeriodicBoxVectors(ContextImpl& context, Vec3
    cu.getPeriodicBoxVectors(a, b, c);
 }
-void CudaUpdateStateDataKernel::setPeriodicBoxVectors(ContextImpl& context, const Vec3& a, const Vec3& b, const Vec3& c) const {
+void CudaUpdateStateDataKernel::setPeriodicBoxVectors(ContextImpl& context, const Vec3& a, const Vec3& b, const Vec3& c) {
    vector<CudaContext*>& contexts = cu.getPlatformData().contexts;
+    // If any particles have been wrapped to the first periodic box, we need to unwrap them
+    // to avoid changing their positions.
+    vector<Vec3> positions;
+    for (int i = 0; i < (int) cu.getPosCellOffsets().size(); i++) {
+        int4& offset = cu.getPosCellOffsets()[i];
+        if (offset.x != 0 || offset.y != 0 || offset.z != 0) {
+            getPositions(context, positions);
+            break;
+        }
+    }
+    // Update the vectors.
    for (int i = 0; i < (int) contexts.size(); i++)
        contexts[i]->setPeriodicBoxVectors(a, b, c);
+    if (positions.size() > 0)
+        setPositions(context, positions);
 }
 void CudaUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream& stream) {
@@ -502,6 +522,7 @@ void CudaCalcHarmonicBondForceKernel::initialize(const System& system, const Har
    }
    params->upload(paramVector);
    map<string, string> replacements;
+    replacements["APPLY_PERIODIC"] = (force.usesPeriodicBoundaryConditions() ? "1" : "0");
    replacements["COMPUTE_FORCE"] = CudaKernelSources::harmonicBondForce;
    replacements["PARAMS"] = cu.getBondedUtilities().addArgument(params->getDevicePointer(), "float2");
    cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaKernelSources::bondForce, replacements), force.getForceGroup());
@@ -638,6 +659,7 @@ void CudaCalcCustomBondForceKernel::initialize(const System& system, const Custo
    vector<pair<string, string> > functionNames;
    compute << cu.getExpressionUtilities().createExpressions(expressions, variables, functions, functionNames, "temp");
    map<string, string> replacements;
+    replacements["APPLY_PERIODIC"] = (force.usesPeriodicBoundaryConditions() ? "1" : "0");
    replacements["COMPUTE_FORCE"] = compute.str();
    cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaKernelSources::bondForce, replacements), force.getForceGroup());
 }
@@ -737,6 +759,7 @@ void CudaCalcHarmonicAngleForceKernel::initialize(const System& system, const Ha
    }
    params->upload(paramVector);
    map<string, string> replacements;
+    replacements["APPLY_PERIODIC"] = (force.usesPeriodicBoundaryConditions() ? "1" : "0");
    replacements["COMPUTE_FORCE"] = CudaKernelSources::harmonicAngleForce;
    replacements["PARAMS"] = cu.getBondedUtilities().addArgument(params->getDevicePointer(), "float2");
    cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaKernelSources::angleForce, replacements), force.getForceGroup());
@@ -874,6 +897,7 @@ void CudaCalcCustomAngleForceKernel::initialize(const System& system, const Cust
    vector<pair<string, string> > functionNames;
    compute << cu.getExpressionUtilities().createExpressions(expressions, variables, functions, functionNames, "temp");
    map<string, string> replacements;
+    replacements["APPLY_PERIODIC"] = (force.usesPeriodicBoundaryConditions() ? "1" : "0");
    replacements["COMPUTE_FORCE"] = compute.str();
    cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaKernelSources::angleForce, replacements), force.getForceGroup());
 }
@@ -974,6 +998,7 @@ void CudaCalcPeriodicTorsionForceKernel::initialize(const System& system, const
    }
    params->upload(paramVector);
    map<string, string> replacements;
+    replacements["APPLY_PERIODIC"] = (force.usesPeriodicBoundaryConditions() ? "1" : "0");
    replacements["COMPUTE_FORCE"] = CudaKernelSources::periodicTorsionForce;
    replacements["PARAMS"] = cu.getBondedUtilities().addArgument(params->getDevicePointer(), "float4");
    cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaKernelSources::torsionForce, replacements), force.getForceGroup());
@@ -1069,6 +1094,7 @@ void CudaCalcRBTorsionForceKernel::initialize(const System& system, const RBTors
    params1->upload(paramVector1);
    params2->upload(paramVector2);
    map<string, string> replacements;
+    replacements["APPLY_PERIODIC"] = (force.usesPeriodicBoundaryConditions() ? "1" : "0");
    replacements["COMPUTE_FORCE"] = CudaKernelSources::rbTorsionForce;
    replacements["PARAMS1"] = cu.getBondedUtilities().addArgument(params1->getDevicePointer(), "float4");
    replacements["PARAMS2"] = cu.getBondedUtilities().addArgument(params2->getDevicePointer(), "float2");
@@ -1186,6 +1212,7 @@ void CudaCalcCMAPTorsionForceKernel::initialize(const System& system, const CMAP
    mapPositions->upload(mapPositionsVec);
    torsionMaps->upload(torsionMapsVec);
    map<string, string> replacements;
+    replacements["APPLY_PERIODIC"] = (force.usesPeriodicBoundaryConditions() ? "1" : "0");
    replacements["COEFF"] = cu.getBondedUtilities().addArgument(coefficients->getDevicePointer(), "float4");
    replacements["MAP_POS"] = cu.getBondedUtilities().addArgument(mapPositions->getDevicePointer(), "int2");
    replacements["MAPS"] = cu.getBondedUtilities().addArgument(torsionMaps->getDevicePointer(), "int");
@@ -1341,6 +1368,7 @@ void CudaCalcCustomTorsionForceKernel::initialize(const System& system, const Cu
    vector<pair<string, string> > functionNames;
    compute << cu.getExpressionUtilities().createExpressions(expressions, variables, functions, functionNames, "temp");
    map<string, string> replacements;
+    replacements["APPLY_PERIODIC"] = (force.usesPeriodicBoundaryConditions() ? "1" : "0");
    replacements["COMPUTE_FORCE"] = compute.str();
    cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaKernelSources::torsionForce, replacements), force.getForceGroup());
 }
@@ -1693,6 +1721,8 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                pmeDefines["USE_DOUBLE_PRECISION"] = "1";
            if (usePmeStream)
                pmeDefines["USE_PME_STREAM"] = "1";
+            if (cu.getPlatformData().deterministicForces)
+                pmeDefines["USE_DETERMINISTIC_FORCES"] = "1";
            CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
            if (cu.getPlatformData().useCpuPme) {
                // Create the CPU PME kernel.
@@ -1911,16 +1941,18 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
        // Execute the reciprocal space kernels.
        void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(),
+                cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
                recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
        sort->sort(*pmeAtomGridIndex);
        void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &directPmeGrid->getDevicePointer(), cu.getPeriodicBoxSizePointer(),
+                cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
                recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer()};
        cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
-        if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0) {
+        if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) {
            void* finishSpreadArgs[] = {&directPmeGrid->getDevicePointer()};
            cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, directPmeGrid->getSize());
        }
@@ -1957,8 +1989,9 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
            fft->execFFT(*reciprocalPmeGrid, *directPmeGrid, false);
        }
-        void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid->getDevicePointer(),
+        void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid->getDevicePointer(), cu.getPeriodicBoxSizePointer(),
-                cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer()};
+                cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+                recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer()};
        cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
        if (usePmeStream) {
            cuEventRecord(pmeSyncEvent, pmeStream);
@@ -2157,6 +2190,7 @@ void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const
    map<string, Lepton::CustomFunction*> functions;
    vector<pair<string, string> > functionDefinitions;
    vector<const TabulatedFunction*> functionList;
+    vector<string> tableTypes;
    for (int i = 0; i < force.getNumTabulatedFunctions(); i++) {
        functionList.push_back(&force.getTabulatedFunction(i));
        string name = force.getTabulatedFunctionName(i);
@@ -2168,6 +2202,10 @@ void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const
        tabulatedFunctions.push_back(CudaArray::create<float>(cu, f.size(), "TabulatedFunction"));
        tabulatedFunctions[tabulatedFunctions.size()-1]->upload(f);
        cu.getNonbondedUtilities().addArgument(CudaNonbondedUtilities::ParameterInfo(arrayName, "float", width, width*sizeof(float), tabulatedFunctions[tabulatedFunctions.size()-1]->getDevicePointer()));
+        if (width == 1)
+            tableTypes.push_back("float");
+        else
+            tableTypes.push_back("float"+cu.intToString(width));
    }
    // Record information for the expressions.
@@ -2220,7 +2258,7 @@ void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const
    }
    string source = cu.replaceStrings(CudaKernelSources::customNonbonded, replacements);
    if (force.getNumInteractionGroups() > 0)
-        initInteractionGroups(force, source);
+        initInteractionGroups(force, source, tableTypes);
    else {
        cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
@@ -2246,7 +2284,7 @@ void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const
    }
 }
-void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbondedForce& force, const string& interactionSource) {
+void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbondedForce& force, const string& interactionSource, const vector<string>& tableTypes) {
    // Process groups to form tiles.
    vector<vector<int> > atomLists;
@@ -2429,6 +2467,8 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
    stringstream args;
    for (int i = 0; i < (int) buffers.size(); i++)
        args<<", const "<<buffers[i].getType()<<"* __restrict__ global_params"<<(i+1);
+    for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
+        args << ", const " << tableTypes[i]<< "* __restrict__ table" << i;
    if (globals != NULL)
        args<<", const float* __restrict__ globals";
    replacements["PARAMETER_ARGUMENTS"] = args.str();
@@ -2519,6 +2559,8 @@ double CudaCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool in
            interactionGroupArgs.push_back(cu.getPeriodicBoxVecZPointer());
            for (int i = 0; i < (int) params->getBuffers().size(); i++)
                interactionGroupArgs.push_back(&params->getBuffers()[i].getMemory());
+            for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
+                interactionGroupArgs.push_back(&tabulatedFunctions[i]->getDevicePointer());
            if (globals != NULL)
                interactionGroupArgs.push_back(&globals->getDevicePointer());
        }
@@ -4446,6 +4488,11 @@ void CudaCalcCustomCentroidBondForceKernel::initialize(const System& system, con
    groupForcesArgs.push_back(NULL); // Energy buffer hasn't been created yet
    groupForcesArgs.push_back(&centerPositions->getDevicePointer());
    groupForcesArgs.push_back(&bondGroups->getDevicePointer());
+    groupForcesArgs.push_back(cu.getPeriodicBoxSizePointer());
+    groupForcesArgs.push_back(cu.getInvPeriodicBoxSizePointer());
+    groupForcesArgs.push_back(cu.getPeriodicBoxVecXPointer());
+    groupForcesArgs.push_back(cu.getPeriodicBoxVecYPointer());
+    groupForcesArgs.push_back(cu.getPeriodicBoxVecZPointer());
    // Record the tabulated functions.
@@ -4526,7 +4573,7 @@ void CudaCalcCustomCentroidBondForceKernel::initialize(const System& system, con
        const vector<int>& groups = iter->second;
        string deltaName = atomNames[groups[0]]+atomNames[groups[1]];
        if (computedDeltas.count(deltaName) == 0) {
-            compute<<"real4 delta"<<deltaName<<" = delta("<<posNames[groups[0]]<<", "<<posNames[groups[1]]<<");\n";
+            compute<<"real4 delta"<<deltaName<<" = delta("<<posNames[groups[0]]<<", "<<posNames[groups[1]]<<", "<<force.usesPeriodicBoundaryConditions()<<", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n";
            computedDeltas.insert(deltaName);
        }
        compute<<"real r_"<<deltaName<<" = sqrt(delta"<<deltaName<<".w);\n";
@@ -4540,11 +4587,11 @@ void CudaCalcCustomCentroidBondForceKernel::initialize(const System& system, con
        string deltaName2 = atomNames[groups[1]]+atomNames[groups[2]];
        string angleName = "angle_"+atomNames[groups[0]]+atomNames[groups[1]]+atomNames[groups[2]];
        if (computedDeltas.count(deltaName1) == 0) {
-            compute<<"real4 delta"<<deltaName1<<" = delta("<<posNames[groups[1]]<<", "<<posNames[groups[0]]<<");\n";
+            compute<<"real4 delta"<<deltaName1<<" = delta("<<posNames[groups[1]]<<", "<<posNames[groups[0]]<<", "<<force.usesPeriodicBoundaryConditions()<<", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n";
            computedDeltas.insert(deltaName1);
        }
        if (computedDeltas.count(deltaName2) == 0) {
-            compute<<"real4 delta"<<deltaName2<<" = delta("<<posNames[groups[1]]<<", "<<posNames[groups[2]]<<");\n";
+            compute<<"real4 delta"<<deltaName2<<" = delta("<<posNames[groups[1]]<<", "<<posNames[groups[2]]<<", "<<force.usesPeriodicBoundaryConditions()<<", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n";
            computedDeltas.insert(deltaName2);
        }
        compute<<"real "<<angleName<<" = computeAngle(delta"<<deltaName1<<", delta"<<deltaName2<<");\n";
@@ -4561,15 +4608,15 @@ void CudaCalcCustomCentroidBondForceKernel::initialize(const System& system, con
        string crossName2 = "cross_"+deltaName2+"_"+deltaName3;
        string dihedralName = "dihedral_"+atomNames[groups[0]]+atomNames[groups[1]]+atomNames[groups[2]]+atomNames[groups[3]];
        if (computedDeltas.count(deltaName1) == 0) {
-            compute<<"real4 delta"<<deltaName1<<" = delta("<<posNames[groups[0]]<<", "<<posNames[groups[1]]<<");\n";
+            compute<<"real4 delta"<<deltaName1<<" = delta("<<posNames[groups[0]]<<", "<<posNames[groups[1]]<<", "<<force.usesPeriodicBoundaryConditions()<<", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n";
            computedDeltas.insert(deltaName1);
        }
        if (computedDeltas.count(deltaName2) == 0) {
-            compute<<"real4 delta"<<deltaName2<<" = delta("<<posNames[groups[2]]<<", "<<posNames[groups[1]]<<");\n";
+            compute<<"real4 delta"<<deltaName2<<" = delta("<<posNames[groups[2]]<<", "<<posNames[groups[1]]<<", "<<force.usesPeriodicBoundaryConditions()<<", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n";
            computedDeltas.insert(deltaName2);
        }
        if (computedDeltas.count(deltaName3) == 0) {
-            compute<<"real4 delta"<<deltaName3<<" = delta("<<posNames[groups[2]]<<", "<<posNames[groups[3]]<<");\n";
+            compute<<"real4 delta"<<deltaName3<<" = delta("<<posNames[groups[2]]<<", "<<posNames[groups[3]]<<", "<<force.usesPeriodicBoundaryConditions()<<", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n";
            computedDeltas.insert(deltaName3);
        }
        compute<<"real4 "<<crossName1<<" = computeCross(delta"<<deltaName1<<", delta"<<deltaName2<<");\n";
@@ -4860,7 +4907,7 @@ void CudaCalcCustomCompoundBondForceKernel::initialize(const System& system, con
        const vector<int>& atoms = iter->second;
        string deltaName = atomNames[atoms[0]]+atomNames[atoms[1]];
        if (computedDeltas.count(deltaName) == 0) {
-            compute<<"real4 delta"<<deltaName<<" = ccb_delta("<<posNames[atoms[0]]<<", "<<posNames[atoms[1]]<<");\n";
+            compute<<"real4 delta"<<deltaName<<" = ccb_delta("<<posNames[atoms[0]]<<", "<<posNames[atoms[1]]<<", "<<force.usesPeriodicBoundaryConditions()<<", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n";
            computedDeltas.insert(deltaName);
        }
        compute<<"real r_"<<deltaName<<" = sqrt(delta"<<deltaName<<".w);\n";
@@ -4874,11 +4921,11 @@ void CudaCalcCustomCompoundBondForceKernel::initialize(const System& system, con
        string deltaName2 = atomNames[atoms[1]]+atomNames[atoms[2]];
        string angleName = "angle_"+atomNames[atoms[0]]+atomNames[atoms[1]]+atomNames[atoms[2]];
        if (computedDeltas.count(deltaName1) == 0) {
-            compute<<"real4 delta"<<deltaName1<<" = ccb_delta("<<posNames[atoms[1]]<<", "<<posNames[atoms[0]]<<");\n";
+            compute<<"real4 delta"<<deltaName1<<" = ccb_delta("<<posNames[atoms[1]]<<", "<<posNames[atoms[0]]<<", "<<force.usesPeriodicBoundaryConditions()<<", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n";
            computedDeltas.insert(deltaName1);
        }
        if (computedDeltas.count(deltaName2) == 0) {
-            compute<<"real4 delta"<<deltaName2<<" = ccb_delta("<<posNames[atoms[1]]<<", "<<posNames[atoms[2]]<<");\n";
+            compute<<"real4 delta"<<deltaName2<<" = ccb_delta("<<posNames[atoms[1]]<<", "<<posNames[atoms[2]]<<", "<<force.usesPeriodicBoundaryConditions()<<", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n";
            computedDeltas.insert(deltaName2);
        }
        compute<<"real "<<angleName<<" = ccb_computeAngle(delta"<<deltaName1<<", delta"<<deltaName2<<");\n";
@@ -4895,15 +4942,15 @@ void CudaCalcCustomCompoundBondForceKernel::initialize(const System& system, con
        string crossName2 = "cross_"+deltaName2+"_"+deltaName3;
        string dihedralName = "dihedral_"+atomNames[atoms[0]]+atomNames[atoms[1]]+atomNames[atoms[2]]+atomNames[atoms[3]];
        if (computedDeltas.count(deltaName1) == 0) {
-            compute<<"real4 delta"<<deltaName1<<" = ccb_delta("<<posNames[atoms[0]]<<", "<<posNames[atoms[1]]<<");\n";
+            compute<<"real4 delta"<<deltaName1<<" = ccb_delta("<<posNames[atoms[0]]<<", "<<posNames[atoms[1]]<<", "<<force.usesPeriodicBoundaryConditions()<<", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n";
            computedDeltas.insert(deltaName1);
        }
        if (computedDeltas.count(deltaName2) == 0) {
-            compute<<"real4 delta"<<deltaName2<<" = ccb_delta("<<posNames[atoms[2]]<<", "<<posNames[atoms[1]]<<");\n";
+            compute<<"real4 delta"<<deltaName2<<" = ccb_delta("<<posNames[atoms[2]]<<", "<<posNames[atoms[1]]<<", "<<force.usesPeriodicBoundaryConditions()<<", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n";
            computedDeltas.insert(deltaName2);
        }
        if (computedDeltas.count(deltaName3) == 0) {
-            compute<<"real4 delta"<<deltaName3<<" = ccb_delta("<<posNames[atoms[2]]<<", "<<posNames[atoms[3]]<<");\n";
+            compute<<"real4 delta"<<deltaName3<<" = ccb_delta("<<posNames[atoms[2]]<<", "<<posNames[atoms[3]]<<", "<<force.usesPeriodicBoundaryConditions()<<", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n";
            computedDeltas.insert(deltaName3);
        }
        compute<<"real4 "<<crossName1<<" = ccb_computeCross(delta"<<deltaName1<<", delta"<<deltaName2<<");\n";
@@ -6236,9 +6283,9 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
        for (int step = 0; step < numSteps; step++) {
            string expr;
            integrator.getComputationStep(step, stepType[step], variable[step], expr);
-            if (stepType[step] == CustomIntegrator::BeginWhileBlock)
+            if (stepType[step] == CustomIntegrator::WhileBlockStart)
                blockEnd[blockEnd[step]] = step; // Record where to branch back to.
-            if (stepType[step] == CustomIntegrator::ComputeGlobal || stepType[step] == CustomIntegrator::BeginIfBlock || stepType[step] == CustomIntegrator::BeginWhileBlock)
+            if (stepType[step] == CustomIntegrator::ComputeGlobal || stepType[step] == CustomIntegrator::IfBlockStart || stepType[step] == CustomIntegrator::WhileBlockStart)
                for (int i = 0; i < (int) expression[step].size(); i++)
                    globalExpressions[step].push_back(expression[step][i].createCompiledExpression());
        }
@@ -6685,15 +6732,15 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
        else if (stepType[step] == CustomIntegrator::ConstrainVelocities) {
            cu.getIntegrationUtilities().applyVelocityConstraints(integrator.getConstraintTolerance());
        }
-        else if (stepType[step] == CustomIntegrator::BeginIfBlock) {
+        else if (stepType[step] == CustomIntegrator::IfBlockStart) {
            if (!evaluateCondition(step))
                nextStep = blockEnd[step]+1;
        }
-        else if (stepType[step] == CustomIntegrator::BeginWhileBlock) {
+        else if (stepType[step] == CustomIntegrator::WhileBlockStart) {
            if (!evaluateCondition(step))
                nextStep = blockEnd[step]+1;
        }
-        else if (stepType[step] == CustomIntegrator::EndBlock) {
+        else if (stepType[step] == CustomIntegrator::BlockEnd) {
            if (blockEnd[step] != -1)
                nextStep = blockEnd[step]; // Return to the start of a while block.
        }
@@ -6958,7 +7005,7 @@ void CudaApplyMonteCarloBarostatKernel::scaleCoordinates(ContextImpl& context, d
        std::stringstream m;
        m<<"Error saving positions for MC barostat: "<<cu.getErrorString(result)<<" ("<<result<<")";
        throw OpenMMException(m.str());
-    }        
+    }
    float scalefX = (float) scaleX;
    float scalefY = (float) scaleY;
    float scalefZ = (float) scaleZ;

--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -65,12 +65,14 @@ private:
 CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
        exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
-        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) {
+        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) {
    // Decide how many thread blocks to use.
    string errorMessage = "Error initializing nonbonded utilities";
    int multiprocessors;
    CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, context.getDevice()));
+    CHECK_RESULT(cuEventCreate(&downloadCountEvent, 0));
+    CHECK_RESULT(cuMemHostAlloc((void**) &pinnedCountBuffer, sizeof(int), CU_MEMHOSTALLOC_PORTABLE));
    numForceThreadBlocks = 4*multiprocessors;
    forceThreadBlockSize = (context.getComputeCapability() < 2.0 ? 128 : 256);
 }
@@ -106,6 +108,9 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
        delete rebuildNeighborList;
    if (blockSorter != NULL)
        delete blockSorter;
+    if (pinnedCountBuffer != NULL)
+        cuMemFreeHost(pinnedCountBuffer);
+    cuEventDestroy(downloadCountEvent);
 }
 void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
@@ -375,17 +380,14 @@ void CudaNonbondedUtilities::prepareInteractions(int forceGroups) {
    if (lastCutoff != kernels.cutoffDistance)
        forceRebuildNeighborList = true;
-    bool rebuild = false;
+    context.executeKernel(kernels.findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getNumAtoms());
-    do {
+    blockSorter->sort(*sortedBlocks);
-        context.executeKernel(kernels.findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getNumAtoms());
+    context.executeKernel(kernels.sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms());
-        blockSorter->sort(*sortedBlocks);
+    context.executeKernel(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms(), 256);
-        context.executeKernel(kernels.sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms());
+    forceRebuildNeighborList = false;
-        context.executeKernel(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms(), 256);
-        forceRebuildNeighborList = false;
-        if (context.getComputeForceCount() == 1)
-            rebuild = updateNeighborListSize(); // This is the first time step, so check whether our initial guess was large enough.
-    } while(rebuild);
    lastCutoff = kernels.cutoffDistance;
+    interactionCount->download(pinnedCountBuffer, false);
+    cuEventRecord(downloadCountEvent, context.getCurrentStream());
 }
 void CudaNonbondedUtilities::computeInteractions(int forceGroups, bool includeForces, bool includeEnergy) {
@@ -398,20 +400,22 @@ void CudaNonbondedUtilities::computeInteractions(int forceGroups, bool includeFo
            kernel = createInteractionKernel(kernels.source, parameters, arguments, true, true, forceGroups, includeForces, includeEnergy);
        context.executeKernel(kernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
    }
+    if (useCutoff && numTiles > 0) {
+        cuEventSynchronize(downloadCountEvent);
+        updateNeighborListSize();
+    }
 }
 bool CudaNonbondedUtilities::updateNeighborListSize() {
    if (!useCutoff)
        return false;
-    unsigned int* pinnedInteractionCount = (unsigned int*) context.getPinnedBuffer();
+    if (pinnedCountBuffer[0] <= (unsigned int) maxTiles)
-    interactionCount->download(pinnedInteractionCount);
-    if (pinnedInteractionCount[0] <= (unsigned int) maxTiles)
        return false;
    // The most recent timestep had too many interactions to fit in the arrays.  Make the arrays bigger to prevent
    // this from happening in the future.
-    maxTiles = (int) (1.2*pinnedInteractionCount[0]);
+    maxTiles = (int) (1.2*pinnedCountBuffer[0]);
    int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
    if (maxTiles > totalTiles)
        maxTiles = totalTiles;
@@ -428,6 +432,7 @@ bool CudaNonbondedUtilities::updateNeighborListSize() {
        forceArgs[17] = &interactingAtoms->getDevicePointer();
    findInteractingBlocksArgs[7] = &interactingAtoms->getDevicePointer();
    forceRebuildNeighborList = true;
+    context.setForcesValid(false);
    return true;
 }

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -62,6 +62,14 @@ extern "C" OPENMM_EXPORT_CUDA void registerPlatforms() {
 #endif
 CudaPlatform::CudaPlatform() {
+    deprecatedPropertyReplacements["CudaDeviceIndex"] = CudaDeviceIndex();
+    deprecatedPropertyReplacements["CudaDeviceName"] = CudaDeviceName();
+    deprecatedPropertyReplacements["CudaUseBlockingSync"] = CudaUseBlockingSync();
+    deprecatedPropertyReplacements["CudaPrecision"] = CudaPrecision();
+    deprecatedPropertyReplacements["CudaUseCpuPme"] = CudaUseCpuPme();
+    deprecatedPropertyReplacements["CudaTempDirectory"] = CudaTempDirectory();
+    deprecatedPropertyReplacements["CudaDisablePmeStream"] = CudaDisablePmeStream();
+    deprecatedPropertyReplacements["CudaDeterministicForces"] = CudaDeterministicForces();
    CudaKernelFactory* factory = new CudaKernelFactory();
    registerKernelFactory(CalcForcesAndEnergyKernel::Name(), factory);
    registerKernelFactory(UpdateStateDataKernel::Name(), factory);
@@ -102,12 +110,14 @@ CudaPlatform::CudaPlatform() {
    platformProperties.push_back(CudaTempDirectory());
    platformProperties.push_back(CudaHostCompiler());
    platformProperties.push_back(CudaDisablePmeStream());
+    platformProperties.push_back(CudaDeterministicForces());
    setPropertyDefaultValue(CudaDeviceIndex(), "");
    setPropertyDefaultValue(CudaDeviceName(), "");
    setPropertyDefaultValue(CudaUseBlockingSync(), "true");
    setPropertyDefaultValue(CudaPrecision(), "single");
    setPropertyDefaultValue(CudaUseCpuPme(), "false");
    setPropertyDefaultValue(CudaDisablePmeStream(), "false");
+    setPropertyDefaultValue(CudaDeterministicForces(), "false");
 #ifdef _MSC_VER
    char* bindir = getenv("CUDA_BIN_PATH");
    string nvcc = (bindir == NULL ? "nvcc.exe" : string(bindir)+"\\nvcc.exe");
@@ -142,7 +152,10 @@ bool CudaPlatform::supportsDoublePrecision() const {
 const string& CudaPlatform::getPropertyValue(const Context& context, const string& property) const {
    const ContextImpl& impl = getContextImpl(context);
    const PlatformData* data = reinterpret_cast<const PlatformData*>(impl.getPlatformData());
-    map<string, string>::const_iterator value = data->propertyValues.find(property);
+    string propertyName = property;
+    if (deprecatedPropertyReplacements.find(property) != deprecatedPropertyReplacements.end())
+        propertyName = deprecatedPropertyReplacements.find(property)->second;
+    map<string, string>::const_iterator value = data->propertyValues.find(propertyName);
    if (value != data->propertyValues.end())
        return value->second;
    return Platform::getPropertyValue(context, property);
@@ -168,10 +181,13 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
            getPropertyDefaultValue(CudaHostCompiler()) : properties.find(CudaHostCompiler())->second);
    string pmeStreamPropValue = (properties.find(CudaDisablePmeStream()) == properties.end() ?
            getPropertyDefaultValue(CudaDisablePmeStream()) : properties.find(CudaDisablePmeStream())->second);
+    string deterministicForcesValue = (properties.find(CudaDeterministicForces()) == properties.end() ?
+            getPropertyDefaultValue(CudaDeterministicForces()) : properties.find(CudaDeterministicForces())->second);
    transform(blockingPropValue.begin(), blockingPropValue.end(), blockingPropValue.begin(), ::tolower);
    transform(precisionPropValue.begin(), precisionPropValue.end(), precisionPropValue.begin(), ::tolower);
    transform(cpuPmePropValue.begin(), cpuPmePropValue.end(), cpuPmePropValue.begin(), ::tolower);
    transform(pmeStreamPropValue.begin(), pmeStreamPropValue.end(), pmeStreamPropValue.begin(), ::tolower);
+    transform(deterministicForcesValue.begin(), deterministicForcesValue.end(), deterministicForcesValue.begin(), ::tolower);
    vector<string> pmeKernelName;
    pmeKernelName.push_back(CalcPmeReciprocalForceKernel::Name());
    if (!supportsKernels(pmeKernelName))
@@ -180,7 +196,8 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
    char* threadsEnv = getenv("OPENMM_CPU_THREADS");
    if (threadsEnv != NULL)
        stringstream(threadsEnv) >> threads;
-    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue, hostCompilerPropValue, pmeStreamPropValue, threads));
+    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue,
+            hostCompilerPropValue, pmeStreamPropValue, deterministicForcesValue, threads));
 }
 void CudaPlatform::contextDestroyed(ContextImpl& context) const {
@@ -189,7 +206,8 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
 }
 CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
-            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty, const string& pmeStreamProperty, int numThreads) :
+            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty, const string& pmeStreamProperty,
+            const string& deterministicForcesProperty, int numThreads) :
                context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false), threads(numThreads) {
    bool blocking = (blockingProperty == "true");
    vector<string> devices;
@@ -230,6 +248,7 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
    }
    useCpuPme = (cpuPmeProperty == "true" && !contexts[0]->getUseDoublePrecision());
    disablePmeStream = (pmeStreamProperty == "true");
+    deterministicForces = (deterministicForcesProperty == "true");
    propertyValues[CudaPlatform::CudaDeviceIndex()] = deviceIndex.str();
    propertyValues[CudaPlatform::CudaDeviceName()] = deviceName.str();
    propertyValues[CudaPlatform::CudaUseBlockingSync()] = blocking ? "true" : "false";
@@ -239,6 +258,7 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
    propertyValues[CudaPlatform::CudaTempDirectory()] = tempProperty;
    propertyValues[CudaPlatform::CudaHostCompiler()] = hostCompilerProperty;
    propertyValues[CudaPlatform::CudaDisablePmeStream()] = disablePmeStream ? "true" : "false";
+    propertyValues[CudaPlatform::CudaDeterministicForces()] = deterministicForces ? "true" : "false";
    contextEnergy.resize(contexts.size());
    // Determine whether peer-to-peer copying is supported, and enable it if so.

--- a/platforms/cuda/src/kernels/angleForce.cu
+++ b/platforms/cuda/src/kernels/angleForce.cu
 real3 v0 = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
 real3 v1 = make_real3(pos2.x-pos3.x, pos2.y-pos3.y, pos2.z-pos3.z);
+#if APPLY_PERIODIC
+APPLY_PERIODIC_TO_DELTA(v0)
+APPLY_PERIODIC_TO_DELTA(v1)
+#endif
 real3 cp = cross(v0, v1);
 real rp = cp.x*cp.x + cp.y*cp.y + cp.z*cp.z;
 rp = max(SQRT(rp), (real) 1.0e-06f);

--- a/platforms/cuda/src/kernels/bondForce.cu
+++ b/platforms/cuda/src/kernels/bondForce.cu
 real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
+#if APPLY_PERIODIC
+APPLY_PERIODIC_TO_DELTA(delta)
+#endif
 real r = SQRT(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z);
 COMPUTE_FORCE
 dEdR = (r > 0) ? (dEdR / r) : 0;

--- a/platforms/cuda/src/kernels/cmapTorsionForce.cu
+++ b/platforms/cuda/src/kernels/cmapTorsionForce.cu
@@ -5,6 +5,11 @@ const real PI = (real) 3.14159265358979323846;
 real3 v0a = make_real3(pos1.x-pos2.x, pos1.y-pos2.y, pos1.z-pos2.z);
 real3 v1a = make_real3(pos3.x-pos2.x, pos3.y-pos2.y, pos3.z-pos2.z);
 real3 v2a = make_real3(pos3.x-pos4.x, pos3.y-pos4.y, pos3.z-pos4.z);
+#if APPLY_PERIODIC
+APPLY_PERIODIC_TO_DELTA(v0a)
+APPLY_PERIODIC_TO_DELTA(v1a)
+APPLY_PERIODIC_TO_DELTA(v2a)
+#endif
 real3 cp0a = cross(v0a, v1a);
 real3 cp1a = cross(v1a, v2a);
 real cosangle = dot(normalize(cp0a), normalize(cp1a));
@@ -28,6 +33,11 @@ angleA = fmod(angleA+2.0f*PI, 2.0f*PI);
 real3 v0b = make_real3(pos5.x-pos6.x, pos5.y-pos6.y, pos5.z-pos6.z);
 real3 v1b = make_real3(pos7.x-pos6.x, pos7.y-pos6.y, pos7.z-pos6.z);
 real3 v2b = make_real3(pos7.x-pos8.x, pos7.y-pos8.y, pos7.z-pos8.z);
+#if APPLY_PERIODIC
+APPLY_PERIODIC_TO_DELTA(v0b)
+APPLY_PERIODIC_TO_DELTA(v1b)
+APPLY_PERIODIC_TO_DELTA(v2b)
+#endif
 real3 cp0b = cross(v0b, v1b);
 real3 cp1b = cross(v1b, v2b);
 cosangle = dot(normalize(cp0b), normalize(cp1b));

--- a/platforms/cuda/src/kernels/customCentroidBond.cu
+++ b/platforms/cuda/src/kernels/customCentroidBond.cu
@@ -66,8 +66,11 @@ inline __device__ real3 trim(real4 v) {
 /**
 * Compute the difference between two vectors, setting the fourth component to the squared magnitude.
 */
-inline __device__ real4 delta(real4 vec1, real4 vec2) {
+inline __device__ real4 delta(real4 vec1, real4 vec2, bool periodic, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
+    if (periodic)
+        APPLY_PERIODIC_TO_DELTA(result);
    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
    return result;
 }
@@ -105,7 +108,7 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
 * Compute the forces on groups based on the bonds.
 */
 extern "C" __global__ void computeGroupForces(unsigned long long* __restrict__ groupForce, mixed* __restrict__ energyBuffer, const real4* __restrict__ centerPositions,
-        const int* __restrict__ bondGroups
+        const int* __restrict__ bondGroups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        EXTRA_ARGS) {
    mixed energy = 0;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_BONDS; index += blockDim.x*gridDim.x) {

--- a/platforms/cuda/src/kernels/customCompoundBond.cu
+++ b/platforms/cuda/src/kernels/customCompoundBond.cu
@@ -8,8 +8,11 @@ inline __device__ real3 ccb_trim(real4 v) {
 /**
 * Compute the difference between two vectors, setting the fourth component to the squared magnitude.
 */
-inline __device__ real4 ccb_delta(real4 vec1, real4 vec2) {
+inline __device__ real4 ccb_delta(real4 vec1, real4 vec2, bool periodic, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
+    if (periodic)
+        APPLY_PERIODIC_TO_DELTA(result);
    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
    return result;
 }

--- a/platforms/cuda/src/kernels/customGBEnergyN2.cu
+++ b/platforms/cuda/src/kernels/customGBEnergyN2.cu
@@ -168,6 +168,8 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
 #ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
 #else
@@ -191,39 +193,35 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
            x = tiles[pos];
            real4 blockSizeX = blockSize[x];
            singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                                  0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                                  0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-        }
+#else
-        else
+        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-#endif
+        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-        {
+        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-            y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            y += (x < y ? -1 : 1);
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+        }
-                y += (x < y ? -1 : 1);
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            }
-            // Skip over tiles that have exclusions, since they were already processed.
+        // Skip over tiles that have exclusions, since they were already processed.
-            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
+        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
-                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
-                    ushort2 tile = exclusionTiles[skipBase+tgx];
+                ushort2 tile = exclusionTiles[skipBase+tgx];
-                    skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+                skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
-                }
-                else
-                    skipTiles[threadIdx.x] = end;
-                skipBase += TILE_SIZE;            
-                currentSkipIndex = tbx;
            }
-            while (skipTiles[currentSkipIndex] < pos)
+            else
-                currentSkipIndex++;
+                skipTiles[threadIdx.x] = end;
-            includeTile = (skipTiles[currentSkipIndex] != pos);
+            skipBase += TILE_SIZE;            
+            currentSkipIndex = tbx;
        }
+        while (skipTiles[currentSkipIndex] < pos)
+            currentSkipIndex++;
+        includeTile = (skipTiles[currentSkipIndex] != pos);
+#endif
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;

--- a/platforms/cuda/src/kernels/customGBValueN2.cu
+++ b/platforms/cuda/src/kernels/customGBValueN2.cu
@@ -146,6 +146,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
 #ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
 #else
@@ -167,39 +169,35 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
+        x = tiles[pos];
-            x = tiles[pos];
+        real4 blockSizeX = blockSize[x];
-            real4 blockSizeX = blockSize[x];
+        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
-            singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
+                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
-                                  0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
+                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-                                  0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
+#else
-        }
+        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-        else
+        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-#endif
+        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-        {
+            y += (x < y ? -1 : 1);
-            y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+        }
-                y += (x < y ? -1 : 1);
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            }
-            // Skip over tiles that have exclusions, since they were already processed.
+        // Skip over tiles that have exclusions, since they were already processed.
-            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
+        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
-                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
-                    ushort2 tile = exclusionTiles[skipBase+tgx];
+                ushort2 tile = exclusionTiles[skipBase+tgx];
-                    skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+                skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
-                }
-                else
-                    skipTiles[threadIdx.x] = end;
-                skipBase += TILE_SIZE;            
-                currentSkipIndex = tbx;
            }
-            while (skipTiles[currentSkipIndex] < pos)
+            else
-                currentSkipIndex++;
+                skipTiles[threadIdx.x] = end;
-            includeTile = (skipTiles[currentSkipIndex] != pos);
+            skipBase += TILE_SIZE;            
+            currentSkipIndex = tbx;
        }
+        while (skipTiles[currentSkipIndex] < pos)
+            currentSkipIndex++;
+        includeTile = (skipTiles[currentSkipIndex] != pos);
+#endif
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;

--- a/platforms/cuda/src/kernels/gbsaObc1.cu
+++ b/platforms/cuda/src/kernels/gbsaObc1.cu
@@ -204,6 +204,8 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
 #ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
 #else
@@ -225,39 +227,35 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
            x = tiles[pos];
            real4 blockSizeX = blockSize[x];
            singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                                  0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                                  0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-        }
+#else
-        else
+        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-#endif
+        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-        {
+        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-            y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            y += (x < y ? -1 : 1);
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+        }
-                y += (x < y ? -1 : 1);
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            }
-            // Skip over tiles that have exclusions, since they were already processed.
+        // Skip over tiles that have exclusions, since they were already processed.
-            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
+        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
-                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
-                    ushort2 tile = exclusionTiles[skipBase+tgx];
+                ushort2 tile = exclusionTiles[skipBase+tgx];
-                    skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+                skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
-                }
-                else
-                    skipTiles[threadIdx.x] = end;
-                skipBase += TILE_SIZE;            
-                currentSkipIndex = tbx;
            }
-            while (skipTiles[currentSkipIndex] < pos)
+            else
-                currentSkipIndex++;
+                skipTiles[threadIdx.x] = end;
-            includeTile = (skipTiles[currentSkipIndex] != pos);
+            skipBase += TILE_SIZE;            
+            currentSkipIndex = tbx;
        }
+        while (skipTiles[currentSkipIndex] < pos)
+            currentSkipIndex++;
+        includeTile = (skipTiles[currentSkipIndex] != pos);
+#endif
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;
@@ -559,6 +557,8 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
 #ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
 #else
@@ -580,39 +580,35 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
+        x = tiles[pos];
-            x = tiles[pos];
+        real4 blockSizeX = blockSize[x];
-            real4 blockSizeX = blockSize[x];
+        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
-            singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
+                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
-                                  0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
+                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-                                  0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
+#else
-        }
+        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-        else
+        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-#endif
+        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-        {
+            y += (x < y ? -1 : 1);
-            y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+        }
-                y += (x < y ? -1 : 1);
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            }
-            // Skip over tiles that have exclusions, since they were already processed.
+        // Skip over tiles that have exclusions, since they were already processed.
-            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
+        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
-                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
-                    ushort2 tile = exclusionTiles[skipBase+tgx];
+                ushort2 tile = exclusionTiles[skipBase+tgx];
-                    skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+                skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
-                }
-                else
-                    skipTiles[threadIdx.x] = end;
-                skipBase += TILE_SIZE;            
-                currentSkipIndex = tbx;
            }
-            while (skipTiles[currentSkipIndex] < pos)
+            else
-                currentSkipIndex++;
+                skipTiles[threadIdx.x] = end;
-            includeTile = (skipTiles[currentSkipIndex] != pos);
+            skipBase += TILE_SIZE;            
+            currentSkipIndex = tbx;
        }
+        while (skipTiles[currentSkipIndex] < pos)
+            currentSkipIndex++;
+        includeTile = (skipTiles[currentSkipIndex] != pos);
+#endif
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;

--- a/platforms/cuda/src/kernels/nonbonded.cu
+++ b/platforms/cuda/src/kernels/nonbonded.cu
@@ -117,7 +117,9 @@ extern "C" __global__ void computeNonbonded(
 #ifndef ENABLE_SHUFFLE
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
 #endif
    // First loop: process tiles that contain exclusions.
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
@@ -309,8 +311,11 @@ extern "C" __global__ void computeNonbonded(
    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).
 #ifdef USE_CUTOFF
    const unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
    int pos = (int) (numTiles > maxTiles ? startTileIndex+warp*(long long)numTileIndices/totalWarps : warp*(long long)numTiles/totalWarps);
    int end = (int) (numTiles > maxTiles ? startTileIndex+(warp+1)*(long long)numTileIndices/totalWarps : (warp+1)*(long long)numTiles/totalWarps);
 #else
@@ -335,39 +340,35 @@ extern "C" __global__ void computeNonbonded(
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
+        x = tiles[pos];
-            x = tiles[pos];
+        real4 blockSizeX = blockSize[x];
-            real4 blockSizeX = blockSize[x];
+        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= MAX_CUTOFF &&
-            singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= MAX_CUTOFF &&
+                              0.5f*periodicBoxSize.y-blockSizeX.y >= MAX_CUTOFF &&
-                                  0.5f*periodicBoxSize.y-blockSizeX.y >= MAX_CUTOFF &&
+                              0.5f*periodicBoxSize.z-blockSizeX.z >= MAX_CUTOFF);
-                                  0.5f*periodicBoxSize.z-blockSizeX.z >= MAX_CUTOFF);
+#else
-        }
+        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-        else
+        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-#endif
+        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-        {
+            y += (x < y ? -1 : 1);
-            y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+        }
-                y += (x < y ? -1 : 1);
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            }
-            // Skip over tiles that have exclusions, since they were already processed.
+        // Skip over tiles that have exclusions, since they were already processed.
-            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
+        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
-                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
-                    ushort2 tile = exclusionTiles[skipBase+tgx];
+                ushort2 tile = exclusionTiles[skipBase+tgx];
-                    skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+                skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
-                }
-                else
-                    skipTiles[threadIdx.x] = end;
-                skipBase += TILE_SIZE;            
-                currentSkipIndex = tbx;
            }
-            while (skipTiles[currentSkipIndex] < pos)
+            else
-                currentSkipIndex++;
+                skipTiles[threadIdx.x] = end;
-            includeTile = (skipTiles[currentSkipIndex] != pos);
+            skipBase += TILE_SIZE;            
+            currentSkipIndex = tbx;
        }
+        while (skipTiles[currentSkipIndex] < pos)
+            currentSkipIndex++;
+        includeTile = (skipTiles[currentSkipIndex] != pos);
+#endif
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;
            // Load atom data for this tile.

--- a/platforms/cuda/src/kernels/pme.cu
+++ b/platforms/cuda/src/kernels/pme.cu
 extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int2* __restrict__ pmeAtomGridIndex,
-            real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
+            real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+            real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
    // Compute the index of the grid point each atom is associated with.
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
        real4 pos = posq[i];
+        APPLY_PERIODIC_TO_POS(pos)
        real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
                             pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
                             pos.z*recipBoxVecZ.z);
@@ -18,7 +20,8 @@ extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int
 }
 extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid,
-        real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex) {
+        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex) {
    real3 data[PME_ORDER];
    const real scale = RECIP(PME_ORDER-1);
@@ -28,9 +31,7 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
        int atom = pmeAtomGridIndex[i].x;
        real4 pos = posq[atom];
-        pos.x -= floor(pos.x*recipBoxVecX.x)*periodicBoxSize.x;
+        APPLY_PERIODIC_TO_POS(pos)
-        pos.y -= floor(pos.y*recipBoxVecY.y)*periodicBoxSize.y;
-        pos.z -= floor(pos.z*recipBoxVecZ.z)*periodicBoxSize.z;
        real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
                             pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
                             pos.z*recipBoxVecZ.z);
@@ -83,7 +84,7 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
 #ifdef USE_DOUBLE_PRECISION
                    unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
                    atomicAdd(&ulonglong_p[index],  static_cast<unsigned long long>((long long) (add*0x100000000)));
-#elif __CUDA_ARCH__ < 200
+#elif __CUDA_ARCH__ < 200 || defined(USE_DETERMINISTIC_FORCES)
                    unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
                    int gridIndex = index;
                    gridIndex = (gridIndex%2 == 0 ? gridIndex/2 : (gridIndex+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2);
@@ -197,7 +198,8 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
 extern "C" __global__
 void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real* __restrict__ originalPmeGrid,
-        real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex) {
+        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex) {
    real3 data[PME_ORDER];
    real3 ddata[PME_ORDER];
    const real scale = RECIP(PME_ORDER-1);
@@ -209,9 +211,7 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
        int atom = pmeAtomGridIndex[i].x;
        real3 force = make_real3(0);
        real4 pos = posq[atom];
-        pos.x -= floor(pos.x*recipBoxVecX.x)*periodicBoxSize.x;
+        APPLY_PERIODIC_TO_POS(pos)
-        pos.y -= floor(pos.y*recipBoxVecY.y)*periodicBoxSize.y;
-        pos.z -= floor(pos.z*recipBoxVecZ.z)*periodicBoxSize.z;
        real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
                             pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
                             pos.z*recipBoxVecZ.z);

--- a/platforms/cuda/src/kernels/torsionForce.cu
+++ b/platforms/cuda/src/kernels/torsionForce.cu
@@ -2,6 +2,11 @@ const real PI = (real) 3.14159265358979323846;
 real3 v0 = make_real3(pos1.x-pos2.x, pos1.y-pos2.y, pos1.z-pos2.z);
 real3 v1 = make_real3(pos3.x-pos2.x, pos3.y-pos2.y, pos3.z-pos2.z);
 real3 v2 = make_real3(pos3.x-pos4.x, pos3.y-pos4.y, pos3.z-pos4.z);
+#if APPLY_PERIODIC
+APPLY_PERIODIC_TO_DELTA(v0)
+APPLY_PERIODIC_TO_DELTA(v1)
+APPLY_PERIODIC_TO_DELTA(v2)
+#endif
 real3 cp0 = cross(v0, v1);
 real3 cp1 = cross(v1, v2);
 real cosangle = dot(normalize(cp0), normalize(cp1));

--- a/platforms/cuda/tests/CudaTests.h
+++ b/platforms/cuda/tests/CudaTests.h
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2015 Stanford University and the Authors.           *
+ * Portions copyright (c) 2015-2016 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -39,5 +39,5 @@ OpenMM::CudaPlatform platform;
 void initializeTests(int argc, char* argv[]) {
    if (argc > 1)
-        platform.setPropertyDefaultValue("CudaPrecision", std::string(argv[1]));
+        platform.setPropertyDefaultValue("Precision", std::string(argv[1]));
 }
--- a/platforms/cuda/tests/TestCudaFFT3D.cpp
+++ b/platforms/cuda/tests/TestCudaFFT3D.cpp
@@ -56,7 +56,7 @@ void testTransform(bool realToComplex, int xsize, int ysize, int zsize) {
    system.addParticle(0.0);
    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
-            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), 1);
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1);
    CudaContext& context = *platformData.contexts[0];
    context.initialize();
    OpenMM_SFMT::SFMT sfmt;

--- a/platforms/cuda/tests/TestCudaNonbondedForce.cpp
+++ b/platforms/cuda/tests/TestCudaNonbondedForce.cpp
@@ -118,9 +118,44 @@ void testReordering() {
    }
 }
+void testDeterministicForces() {
+    // Check that the CudaDeterministicForces property works correctly.
+    const int numParticles = 1000;
+    System system;
+    system.setDefaultPeriodicBoxVectors(Vec3(6, 0, 0), Vec3(2.1, 6, 0), Vec3(-1.5, -0.5, 6));
+    NonbondedForce *nonbonded = new NonbondedForce();
+    nonbonded->setNonbondedMethod(NonbondedForce::PME);
+    system.addForce(nonbonded);
+    vector<Vec3> positions;
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    for (int i = 0; i < numParticles; i++) {
+        system.addParticle(1.0);
+        nonbonded->addParticle(i%2 == 0 ? 1 : -1, 1, 0);
+        positions.push_back(Vec3(genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5)*6);
+    }
+    VerletIntegrator integrator(0.001);
+    map<string, string> properties;
+    properties[CudaPlatform::CudaDeterministicForces()] = "true";
+    Context context(system, integrator, platform, properties);
+    context.setPositions(positions);
+    State state1 = context.getState(State::Forces);
+    State state2 = context.getState(State::Forces);
+    // All forces should be *exactly* equal.
+    for (int i = 0; i < numParticles; i++) {
+        ASSERT_EQUAL(state1.getForces()[i][0], state2.getForces()[i][0]);
+        ASSERT_EQUAL(state1.getForces()[i][1], state2.getForces()[i][1]);
+        ASSERT_EQUAL(state1.getForces()[i][2], state2.getForces()[i][2]);
+    }
+}
 void runPlatformTests() {
    testParallelComputation(NonbondedForce::NoCutoff);
    testParallelComputation(NonbondedForce::Ewald);
    testParallelComputation(NonbondedForce::PME);
    testReordering();
+    testDeterministicForces();
 }
--- a/platforms/cuda/tests/TestCudaRandom.cpp
+++ b/platforms/cuda/tests/TestCudaRandom.cpp
@@ -56,7 +56,7 @@ void testGaussian() {
        system.addParticle(1.0);
    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
-            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), 1);
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1);
    CudaContext& context = *platformData.contexts[0];
    context.initialize();
    context.getIntegrationUtilities().initRandomNumberGenerator(0);