Continuing to implement new CUDA platform: NonbondedForce

86aacbd8 · Peter Eastman · eb64fa2f · 86aacbd8 · 86aacbd8 · 86aacbd8
Commit 86aacbd8 authored Jun 15, 2012 by Peter Eastman
20 changed files
--- a/platforms/cuda2/sharedTarget/CMakeLists.txt
+++ b/platforms/cuda2/sharedTarget/CMakeLists.txt
@@ -18,7 +18,7 @@ IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
 ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
 ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
-TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${CUDA_LIBRARIES} ${PTHREADS_LIB})
+TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} ${PTHREADS_LIB})
 SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_BUILDING_SHARED_LIBRARY")
 INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
--- a/platforms/cuda2/src/CudaBondedUtilities.cpp
+++ b/platforms/cuda2/src/CudaBondedUtilities.cpp
@@ -66,7 +66,8 @@ void CudaBondedUtilities::addPrefixCode(const string& source) {
 void CudaBondedUtilities::initialize(const System& system) {
    int numForces = forceAtoms.size();
-    if (numForces == 0)
+    hasInteractions = (numForces > 0);
+    if (!hasInteractions)
        return;
    // Build the lists of atom indices.
@@ -164,6 +165,8 @@ void CudaBondedUtilities::computeInteractions(int groups) {
        for (int i = 0; i < (int) arguments.size(); i++)
            kernelArgs.push_back(&arguments[i]);
    }
+    if (!hasInteractions)
+        return;
    kernelArgs[3] = &groups;
    context.executeKernel(kernel, &kernelArgs[0], maxBonds);
 }
--- a/platforms/cuda2/src/CudaBondedUtilities.h
+++ b/platforms/cuda2/src/CudaBondedUtilities.h
@@ -131,7 +131,7 @@ private:
    std::vector<std::string> prefixCode;
    std::vector<void*> kernelArgs;
    int numForceBuffers, maxBonds;
-    bool hasInitializedKernels;
+    bool hasInitializedKernels, hasInteractions;
 };
 } // namespace OpenMM

--- a/platforms/cuda2/src/CudaContext.cpp
+++ b/platforms/cuda2/src/CudaContext.cpp
@@ -34,7 +34,7 @@
 #include "CudaForceInfo.h"
 #include "CudaIntegrationUtilities.h"
 #include "CudaKernelSources.h"
-//#include "CudaNonbondedUtilities.h"
+#include "CudaNonbondedUtilities.h"
 #include "hilbert.h"
 #include "openmm/OpenMMException.h"
 #include "openmm/Platform.h"
@@ -68,7 +68,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        const string& tempDir, CudaPlatform::PlatformData& platformData) : system(system), compiler(compiler),
        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), contextIsValid(false), atomsWereReordered(false), pinnedBuffer(NULL), posq(NULL),
        velm(NULL), force(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL), expression(NULL),
-        bonded(NULL), /*nonbonded(NULL),*/ thread(NULL) {
+        bonded(NULL), nonbonded(NULL), thread(NULL) {
    if (!hasInitializedCuda) {
        CHECK_RESULT2(cuInit(0), "Error initializing CUDA");
        hasInitializedCuda = true;
@@ -122,7 +122,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        throw OpenMMException("No compatible CUDA device is available");
    CHECK_RESULT(cuDeviceGet(&device, deviceIndex));
    this->deviceIndex = deviceIndex;
-    compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize);
    defaultOptimizationOptions = "--use_fast_math";
    unsigned int flags = CU_CTX_MAP_HOST;
    if (useBlockingSync)
@@ -139,13 +138,18 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    int numThreadBlocksPerComputeUnit = 6;
    numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
    bonded = new CudaBondedUtilities(*this);
-//    nonbonded = new CudaNonbondedUtilities(*this);
+    nonbonded = new CudaNonbondedUtilities(*this);
+    int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
    if (useDoublePrecision) {
        posq = CudaArray::create<double4>(paddedNumAtoms, "posq");
        velm = CudaArray::create<double4>(paddedNumAtoms, "velm");
+        compilationDefines["USE_DOUBLE_PRECISION"] = "1";
        compilationDefines["make_real2"] = "make_double2";
        compilationDefines["make_real3"] = "make_double3";
        compilationDefines["make_real4"] = "make_double4";
+        energyBuffer = CudaArray::create<double>(numEnergyBuffers, "energyBuffer");
+        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
+        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
    }
    else {
        posq = CudaArray::create<float4>(paddedNumAtoms, "posq");
@@ -153,6 +157,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        compilationDefines["make_real2"] = "make_float2";
        compilationDefines["make_real3"] = "make_float3";
        compilationDefines["make_real4"] = "make_float4";
+        energyBuffer = CudaArray::create<float>(numEnergyBuffers, "energyBuffer");
+        int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
+        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
    }
    posCellOffsets.resize(paddedNumAtoms, make_int4(0, 0, 0, 0));
@@ -191,6 +198,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
 }
 CudaContext::~CudaContext() {
+    cuCtxSetCurrent(context);
    for (int i = 0; i < (int) forces.size(); i++)
        delete forces[i];
    for (int i = 0; i < (int) reorderListeners.size(); i++)
@@ -211,8 +219,8 @@ CudaContext::~CudaContext() {
        delete expression;
    if (bonded != NULL)
        delete bonded;
-//    if (nonbonded != NULL)
+    if (nonbonded != NULL)
-//        delete nonbonded;
+        delete nonbonded;
    if (thread != NULL)
        delete thread;
    string errorMessage = "Error deleting Context";
@@ -221,17 +229,8 @@ CudaContext::~CudaContext() {
 }
 void CudaContext::initialize() {
+    cuCtxSetCurrent(context);
    string errorMessage = "Error initializing Context";
-    if (useDoublePrecision) {
-        energyBuffer = CudaArray::create<double>(numThreadBlocks*ThreadBlockSize, "energyBuffer");
-        int pinnedBufferSize = max(paddedNumAtoms*4, numThreadBlocks*ThreadBlockSize);
-        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
-    }
-    else {
-        energyBuffer = CudaArray::create<float>(numThreadBlocks*ThreadBlockSize, "energyBuffer");
-        int pinnedBufferSize = max(paddedNumAtoms*6, numThreadBlocks*ThreadBlockSize);
-        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
-    }
    for (int i = 0; i < numAtoms; i++) {
        double mass = system.getParticleMass(i);
        if (useDoublePrecision)
@@ -251,7 +250,7 @@ void CudaContext::initialize() {
    atomIndexDevice->upload(atomIndex);
    findMoleculeGroups();
    moleculesInvalid = false;
-//    nonbonded->initialize(system);
+    nonbonded->initialize(system);
 }
 void CudaContext::addForce(CudaForceInfo* force) {
@@ -719,226 +718,226 @@ void CudaContext::invalidateMolecules() {
    moleculesInvalid = true;
 }
-//void OpenCLContext::validateMolecules() {
+void CudaContext::validateMolecules() {
-//    moleculesInvalid = false;
+    moleculesInvalid = false;
-//    if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
+    if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
-//        return;
+        return;
-//    bool valid = true;
+    bool valid = true;
-//    for (int group = 0; valid && group < (int) moleculeGroups.size(); group++) {
+    for (int group = 0; valid && group < (int) moleculeGroups.size(); group++) {
-//        MoleculeGroup& mol = moleculeGroups[group];
+        MoleculeGroup& mol = moleculeGroups[group];
-//        vector<int>& instances = mol.instances;
+        vector<int>& instances = mol.instances;
-//        vector<int>& offsets = mol.offsets;
+        vector<int>& offsets = mol.offsets;
-//        vector<int>& atoms = mol.atoms;
+        vector<int>& atoms = mol.atoms;
-//        int numMolecules = instances.size();
+        int numMolecules = instances.size();
-//        Molecule& m1 = molecules[instances[0]];
+        Molecule& m1 = molecules[instances[0]];
-//        int offset1 = offsets[0];
+        int offset1 = offsets[0];
-//        for (int j = 1; valid && j < numMolecules; j++) {
+        for (int j = 1; valid && j < numMolecules; j++) {
-//            // See if the atoms are identical.
+            // See if the atoms are identical.
-//
-//            Molecule& m2 = molecules[instances[j]];
+            Molecule& m2 = molecules[instances[j]];
-//            int offset2 = offsets[j];
+            int offset2 = offsets[j];
-//            for (int i = 0; i < (int) atoms.size() && valid; i++) {
+            for (int i = 0; i < (int) atoms.size() && valid; i++) {
-//                for (int k = 0; k < (int) forces.size(); k++)
+                for (int k = 0; k < (int) forces.size(); k++)
-//                    if (!forces[k]->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2))
+                    if (!forces[k]->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2))
-//                        valid = false;
+                        valid = false;
-//            }
+            }
-//
-//            // See if the force groups are identical.
+            // See if the force groups are identical.
-//
-//            for (int i = 0; i < (int) forces.size() && valid; i++) {
+            for (int i = 0; i < (int) forces.size() && valid; i++) {
-//                for (int k = 0; k < (int) m1.groups[i].size() && valid; k++)
+                for (int k = 0; k < (int) m1.groups[i].size() && valid; k++)
-//                    if (!forces[i]->areGroupsIdentical(m1.groups[i][k], m2.groups[i][k]))
+                    if (!forces[i]->areGroupsIdentical(m1.groups[i][k], m2.groups[i][k]))
-//                        valid = false;
+                        valid = false;
-//            }
+            }
-//        }
+        }
-//    }
+    }
-//    if (valid)
+    if (valid)
-//        return;
+        return;
-//    
-//    // The list of which molecules are identical is no longer valid.  We need to restore the
+    // The list of which molecules are identical is no longer valid.  We need to restore the
-//    // atoms to their original order, rebuild the list of identical molecules, and sort them
+    // atoms to their original order, rebuild the list of identical molecules, and sort them
-//    // again.
+    // again.
-//    
-//    vector<mm_float4> newPosq(numAtoms);
+    vector<float4> oldPosq(paddedNumAtoms);
-//    vector<mm_float4> newVelm(numAtoms);
+    vector<float4> newPosq(paddedNumAtoms);
-//    vector<mm_int4> newCellOffsets(numAtoms);
+    vector<float4> oldVelm(paddedNumAtoms);
-//    posq->download();
+    vector<float4> newVelm(paddedNumAtoms);
-//    velm->download();
+    vector<int4> newCellOffsets(numAtoms);
-//    for (int i = 0; i < numAtoms; i++) {
+    posq->download(oldPosq);
-//        int index = atomIndex->get(i);
+    velm->download(oldVelm);
-//        newPosq[index] = posq->get(i);
+    for (int i = 0; i < numAtoms; i++) {
-//        newVelm[index] = velm->get(i);
+        int index = atomIndex[i];
-//        newCellOffsets[index] = posCellOffsets[i];
+        newPosq[index] = oldPosq[i];
-//    }
+        newVelm[index] = oldVelm[i];
-//    for (int i = 0; i < numAtoms; i++) {
+        newCellOffsets[index] = posCellOffsets[i];
-//        posq->set(i, newPosq[i]);
+    }
-//        velm->set(i, newVelm[i]);
+    for (int i = 0; i < numAtoms; i++) {
-//        atomIndex->set(i, i);
+        atomIndex[i] = i;
-//        posCellOffsets[i] = newCellOffsets[i];
+        posCellOffsets[i] = newCellOffsets[i];
-//    }
+    }
-//    posq->upload();
+    posq->upload(newPosq);
-//    velm->upload();
+    velm->upload(newVelm);
-//    atomIndex->upload();
+    atomIndexDevice->upload(atomIndex);
-//    findMoleculeGroups();
+    findMoleculeGroups();
-//    for (int i = 0; i < (int) reorderListeners.size(); i++)
+    for (int i = 0; i < (int) reorderListeners.size(); i++)
-//        reorderListeners[i]->execute();
+        reorderListeners[i]->execute();
-//}
+}
-//
-//void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
+void CudaContext::reorderAtoms(bool enforcePeriodic) {
-//    if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
+    if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
-//        return;
+        return;
-//    if (moleculesInvalid)
+    if (moleculesInvalid)
-//        validateMolecules();
+        validateMolecules();
-//    atomsWereReordered = true;
+    atomsWereReordered = true;
-//
-//    // Find the range of positions and the number of bins along each axis.
+    // Find the range of positions and the number of bins along each axis.
-//
-//    posq->download();
+    vector<float4> oldPosq(paddedNumAtoms);
-//    velm->download();
+    vector<float4> oldVelm(paddedNumAtoms);
-//    float minx = posq->get(0).x, maxx = posq->get(0).x;
+    posq->download(oldPosq);
-//    float miny = posq->get(0).y, maxy = posq->get(0).y;
+    velm->download(oldVelm);
-//    float minz = posq->get(0).z, maxz = posq->get(0).z;
+    float minx = oldPosq[0].x, maxx = oldPosq[0].x;
-//    if (nonbonded->getUsePeriodic()) {
+    float miny = oldPosq[0].y, maxy = oldPosq[0].y;
-//        minx = miny = minz = 0.0;
+    float minz = oldPosq[0].z, maxz = oldPosq[0].z;
-//        maxx = periodicBoxSize.x;
+    if (nonbonded->getUsePeriodic()) {
-//        maxy = periodicBoxSize.y;
+        minx = miny = minz = 0.0;
-//        maxz = periodicBoxSize.z;
+        maxx = periodicBoxSize.x;
-//    }
+        maxy = periodicBoxSize.y;
-//    else {
+        maxz = periodicBoxSize.z;
-//        for (int i = 1; i < numAtoms; i++) {
+    }
-//            const mm_float4& pos = posq->get(i);
+    else {
-//            minx = min(minx, pos.x);
+        for (int i = 1; i < numAtoms; i++) {
-//            maxx = max(maxx, pos.x);
+            const float4& pos = oldPosq[i];
-//            miny = min(miny, pos.y);
+            minx = min(minx, pos.x);
-//            maxy = max(maxy, pos.y);
+            maxx = max(maxx, pos.x);
-//            minz = min(minz, pos.z);
+            miny = min(miny, pos.y);
-//            maxz = max(maxz, pos.z);
+            maxy = max(maxy, pos.y);
-//        }
+            minz = min(minz, pos.z);
-//    }
+            maxz = max(maxz, pos.z);
-//
+        }
-//    // Loop over each group of identical molecules and reorder them.
+    }
-//
-//    vector<int> originalIndex(numAtoms);
+    // Loop over each group of identical molecules and reorder them.
-//    vector<mm_float4> newPosq(numAtoms);
-//    vector<mm_float4> newVelm(numAtoms);
+    vector<int> originalIndex(numAtoms);
-//    vector<mm_int4> newCellOffsets(numAtoms);
+    vector<float4> newPosq(paddedNumAtoms);
-//    for (int group = 0; group < (int) moleculeGroups.size(); group++) {
+    vector<float4> newVelm(paddedNumAtoms);
-//        // Find the center of each molecule.
+    vector<int4> newCellOffsets(numAtoms);
-//
+    for (int group = 0; group < (int) moleculeGroups.size(); group++) {
-//        MoleculeGroup& mol = moleculeGroups[group];
+        // Find the center of each molecule.
-//        int numMolecules = mol.offsets.size();
-//        vector<int>& atoms = mol.atoms;
+        MoleculeGroup& mol = moleculeGroups[group];
-//        vector<mm_float4> molPos(numMolecules);
+        int numMolecules = mol.offsets.size();
-//        float invNumAtoms = 1.0f/atoms.size();
+        vector<int>& atoms = mol.atoms;
-//        for (int i = 0; i < numMolecules; i++) {
+        vector<float4> molPos(numMolecules);
-//            molPos[i].x = 0.0f;
+        float invNumAtoms = 1.0f/atoms.size();
-//            molPos[i].y = 0.0f;
+        for (int i = 0; i < numMolecules; i++) {
-//            molPos[i].z = 0.0f;
+            molPos[i].x = 0.0f;
-//            for (int j = 0; j < (int)atoms.size(); j++) {
+            molPos[i].y = 0.0f;
-//                int atom = atoms[j]+mol.offsets[i];
+            molPos[i].z = 0.0f;
-//                const mm_float4& pos = posq->get(atom);
+            for (int j = 0; j < (int)atoms.size(); j++) {
-//                molPos[i].x += pos.x;
+                int atom = atoms[j]+mol.offsets[i];
-//                molPos[i].y += pos.y;
+                const float4& pos = oldPosq[atom];
-//                molPos[i].z += pos.z;
+                molPos[i].x += pos.x;
-//            }
+                molPos[i].y += pos.y;
-//            molPos[i].x *= invNumAtoms;
+                molPos[i].z += pos.z;
-//            molPos[i].y *= invNumAtoms;
+            }
-//            molPos[i].z *= invNumAtoms;
+            molPos[i].x *= invNumAtoms;
-//        }
+            molPos[i].y *= invNumAtoms;
-//        if (nonbonded->getUsePeriodic()) {
+            molPos[i].z *= invNumAtoms;
-//            // Move each molecule position into the same box.
+        }
-//
+        if (nonbonded->getUsePeriodic()) {
-//            for (int i = 0; i < numMolecules; i++) {
+            // Move each molecule position into the same box.
-//                int xcell = (int) floor(molPos[i].x*invPeriodicBoxSize.x);
-//                int ycell = (int) floor(molPos[i].y*invPeriodicBoxSize.y);
+            for (int i = 0; i < numMolecules; i++) {
-//                int zcell = (int) floor(molPos[i].z*invPeriodicBoxSize.z);
+                int xcell = (int) floor(molPos[i].x*invPeriodicBoxSize.x);
-//                float dx = xcell*periodicBoxSize.x;
+                int ycell = (int) floor(molPos[i].y*invPeriodicBoxSize.y);
-//                float dy = ycell*periodicBoxSize.y;
+                int zcell = (int) floor(molPos[i].z*invPeriodicBoxSize.z);
-//                float dz = zcell*periodicBoxSize.z;
+                float dx = xcell*periodicBoxSize.x;
-//                if (dx != 0.0f || dy != 0.0f || dz != 0.0f) {
+                float dy = ycell*periodicBoxSize.y;
-//                    molPos[i].x -= dx;
+                float dz = zcell*periodicBoxSize.z;
-//                    molPos[i].y -= dy;
+                if (dx != 0.0f || dy != 0.0f || dz != 0.0f) {
-//                    molPos[i].z -= dz;
+                    molPos[i].x -= dx;
-//                    if (enforcePeriodic) {
+                    molPos[i].y -= dy;
-//                        for (int j = 0; j < (int) atoms.size(); j++) {
+                    molPos[i].z -= dz;
-//                            int atom = atoms[j]+mol.offsets[i];
+                    if (enforcePeriodic) {
-//                            mm_float4 p = posq->get(atom);
+                        for (int j = 0; j < (int) atoms.size(); j++) {
-//                            p.x -= dx;
+                            int atom = atoms[j]+mol.offsets[i];
-//                            p.y -= dy;
+                            float4 p = oldPosq[atom];
-//                            p.z -= dz;
+                            p.x -= dx;
-//                            posq->set(atom, p);
+                            p.y -= dy;
-//                            posCellOffsets[atom].x -= xcell;
+                            p.z -= dz;
-//                            posCellOffsets[atom].y -= ycell;
+                            oldPosq[atom] = p;
-//                            posCellOffsets[atom].z -= zcell;
+                            posCellOffsets[atom].x -= xcell;
-//                        }
+                            posCellOffsets[atom].y -= ycell;
-//                    }
+                            posCellOffsets[atom].z -= zcell;
-//                }
+                        }
-//            }
+                    }
-//        }
+                }
-//
+            }
-//        // Select a bin for each molecule, then sort them by bin.
+        }
-//
-//        bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve.
+        // Select a bin for each molecule, then sort them by bin.
-//        float binWidth;
-//        if (useHilbert)
+        bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve.
-//            binWidth = (float)(max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
+        float binWidth;
-//        else
+        if (useHilbert)
-//            binWidth = (float)(0.2*nonbonded->getCutoffDistance());
+            binWidth = (float)(max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
-//        float invBinWidth = 1.0f/binWidth;
+        else
-//        int xbins = 1 + (int) ((maxx-minx)*invBinWidth);
+            binWidth = (float)(0.2*nonbonded->getCutoffDistance());
-//        int ybins = 1 + (int) ((maxy-miny)*invBinWidth);
+        float invBinWidth = 1.0f/binWidth;
-//        vector<pair<int, int> > molBins(numMolecules);
+        int xbins = 1 + (int) ((maxx-minx)*invBinWidth);
-//        bitmask_t coords[3];
+        int ybins = 1 + (int) ((maxy-miny)*invBinWidth);
-//        for (int i = 0; i < numMolecules; i++) {
+        vector<pair<int, int> > molBins(numMolecules);
-//            int x = (int) ((molPos[i].x-minx)*invBinWidth);
+        bitmask_t coords[3];
-//            int y = (int) ((molPos[i].y-miny)*invBinWidth);
+        for (int i = 0; i < numMolecules; i++) {
-//            int z = (int) ((molPos[i].z-minz)*invBinWidth);
+            int x = (int) ((molPos[i].x-minx)*invBinWidth);
-//            int bin;
+            int y = (int) ((molPos[i].y-miny)*invBinWidth);
-//            if (useHilbert) {
+            int z = (int) ((molPos[i].z-minz)*invBinWidth);
-//                coords[0] = x;
+            int bin;
-//                coords[1] = y;
+            if (useHilbert) {
-//                coords[2] = z;
+                coords[0] = x;
-//                bin = (int) hilbert_c2i(3, 8, coords);
+                coords[1] = y;
-//            }
+                coords[2] = z;
-//            else {
+                bin = (int) hilbert_c2i(3, 8, coords);
-//                int yodd = y&1;
+            }
-//                int zodd = z&1;
+            else {
-//                bin = z*xbins*ybins;
+                int yodd = y&1;
-//                bin += (zodd ? ybins-y : y)*xbins;
+                int zodd = z&1;
-//                bin += (yodd ? xbins-x : x);
+                bin = z*xbins*ybins;
-//            }
+                bin += (zodd ? ybins-y : y)*xbins;
-//            molBins[i] = pair<int, int>(bin, i);
+                bin += (yodd ? xbins-x : x);
-//        }
+            }
-//        sort(molBins.begin(), molBins.end());
+            molBins[i] = pair<int, int>(bin, i);
-//
+        }
-//        // Reorder the atoms.
+        sort(molBins.begin(), molBins.end());
-//
-//        for (int i = 0; i < numMolecules; i++) {
+        // Reorder the atoms.
-//            for (int j = 0; j < (int)atoms.size(); j++) {
-//                int oldIndex = mol.offsets[molBins[i].second]+atoms[j];
+        for (int i = 0; i < numMolecules; i++) {
-//                int newIndex = mol.offsets[i]+atoms[j];
+            for (int j = 0; j < (int)atoms.size(); j++) {
-//                originalIndex[newIndex] = atomIndex->get(oldIndex);
+                int oldIndex = mol.offsets[molBins[i].second]+atoms[j];
-//                newPosq[newIndex] = posq->get(oldIndex);
+                int newIndex = mol.offsets[i]+atoms[j];
-//                newVelm[newIndex] = velm->get(oldIndex);
+                originalIndex[newIndex] = atomIndex[oldIndex];
-//                newCellOffsets[newIndex] = posCellOffsets[oldIndex];
+                newPosq[newIndex] = oldPosq[oldIndex];
-//            }
+                newVelm[newIndex] = oldVelm[oldIndex];
-//        }
+                newCellOffsets[newIndex] = posCellOffsets[oldIndex];
-//    }
+            }
-//
+        }
-//    // Update the streams.
+    }
-//
-//    for (int i = 0; i < numAtoms; i++) {
+    // Update the streams.
-//        posq->set(i, newPosq[i]);
-//        velm->set(i, newVelm[i]);
+    for (int i = 0; i < numAtoms; i++) {
-//        atomIndex->set(i, originalIndex[i]);
+        atomIndex[i] = originalIndex[i];
-//        posCellOffsets[i] = newCellOffsets[i];
+        posCellOffsets[i] = newCellOffsets[i];
-//    }
+    }
-//    posq->upload();
+    posq->upload(newPosq);
-//    velm->upload();
+    velm->upload(newVelm);
-//    atomIndex->upload();
+    atomIndexDevice->upload(atomIndex);
-//    for (int i = 0; i < (int) reorderListeners.size(); i++)
+    for (int i = 0; i < (int) reorderListeners.size(); i++)
-//        reorderListeners[i]->execute();
+        reorderListeners[i]->execute();
-//}
+}
 struct CudaContext::WorkThread::ThreadData {
    ThreadData(std::queue<CudaContext::WorkTask*>& tasks, bool& waiting,  bool& finished,

--- a/platforms/cuda2/src/CudaContext.h
+++ b/platforms/cuda2/src/CudaContext.h
@@ -324,6 +324,8 @@ public:
    void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
        periodicBoxSize = make_double4(xsize, ysize, zsize, 0.0);
        invPeriodicBoxSize = make_double4(1.0/xsize, 1.0/ysize, 1.0/zsize, 0.0);
+        periodicBoxSizeFloat = make_float4((float) xsize, (float) ysize, (float) zsize, 0.0f);
+        invPeriodicBoxSizeFloat = make_float4(1.0f/(float) xsize, 1.0f/(float) ysize, 1.0f/(float) zsize, 0.0f);
    }
    /**
     * Get the inverse of the size of the periodic box.
@@ -331,6 +333,20 @@ public:
    double4 getInvPeriodicBoxSize() const {
        return invPeriodicBoxSize;
    }
+    /**
+     * Return the address of the stored periodic box size, for use as a kernel argument.
+     * The pointee is the double4 copy when this context uses double precision, and the
+     * float4 copy otherwise.
+     */
+    void* getPeriodicBoxSizePointer() {
+        if (useDoublePrecision)
+            return &periodicBoxSize;
+        return &periodicBoxSizeFloat;
+    }
+    /**
+     * Return the address of the stored inverse periodic box size, for use as a kernel argument.
+     * The pointee is the double4 copy when this context uses double precision, and the
+     * float4 copy otherwise.
+     */
+    void* getInvPeriodicBoxSizePointer() {
+        if (useDoublePrecision)
+            return &invPeriodicBoxSize;
+        return &invPeriodicBoxSizeFloat;
+    }
    /**
     * Get the CudaIntegrationUtilities for this context.
     */
@@ -349,12 +365,12 @@ public:
    CudaBondedUtilities& getBondedUtilities() {
        return *bonded;
    }
-//    /**
+    /**
-//     * Get the CudaNonbondedUtilities for this context.
+     * Get the CudaNonbondedUtilities for this context.
-//     */
+     */
-//    CudaNonbondedUtilities& getNonbondedUtilities() {
+    CudaNonbondedUtilities& getNonbondedUtilities() {
-//        return *nonbonded;
+        return *nonbonded;
-//    }
+    }
    /**
     * Get the thread used by this context for executing parallel computations.
     */
@@ -429,8 +445,8 @@ private:
    int numThreadBlocks;
    bool useBlockingSync, useDoublePrecision, accumulateInDouble, contextIsValid, atomsWereReordered, moleculesInvalid;
    std::string compiler, tempDir, gpuArchitecture;
-    double4 periodicBoxSize;
+    float4 periodicBoxSizeFloat, invPeriodicBoxSizeFloat;
-    double4 invPeriodicBoxSize;
+    double4 periodicBoxSize, invPeriodicBoxSize;
    std::string defaultOptimizationOptions;
    std::map<std::string, std::string> compilationDefines;
    CUcontext context;
@@ -458,7 +474,7 @@ private:
    CudaIntegrationUtilities* integration;
    CudaExpressionUtilities* expression;
    CudaBondedUtilities* bonded;
-//    CudaNonbondedUtilities* nonbonded;
+    CudaNonbondedUtilities* nonbonded;
    WorkThread* thread;
 };

--- a/platforms/cuda2/src/CudaKernelFactory.cpp
+++ b/platforms/cuda2/src/CudaKernelFactory.cpp
@@ -92,8 +92,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
        return new CudaCalcCMAPTorsionForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomTorsionForceKernel::Name())
        return new CudaCalcCustomTorsionForceKernel(name, platform, cu, context.getSystem());
-//    if (name == CalcNonbondedForceKernel::Name())
+    if (name == CalcNonbondedForceKernel::Name())
-//        return new CudaCalcNonbondedForceKernel(name, platform, cu, context.getSystem());
+        return new CudaCalcNonbondedForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcCustomNonbondedForceKernel::Name())
 //        return new CudaCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcGBSAOBCForceKernel::Name())

--- a/platforms/cuda2/src/CudaKernels.cpp
+++ b/platforms/cuda2/src/CudaKernels.cpp
@@ -83,23 +83,23 @@ void CudaCalcForcesAndEnergyKernel::initialize(const System& system) {
 // Set up a force/energy evaluation: make the CUDA context current, optionally reorder
 // atoms, bump the force-computation counter, clear the autoclear buffers and, if the
 // nonbonded force group is selected in `groups`, prepare the nonbonded interactions.
 void CudaCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
    cuCtxSetCurrent(cu.getContext());
    // Nonbonded work is included only when the bit for its force group is set in `groups`.
-//    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
+    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
-//    bool includeNonbonded = ((groups&(1<<nb.getForceGroup())) != 0);
+    bool includeNonbonded = ((groups&(1<<nb.getForceGroup())) != 0);
-//    cu.setAtomsWereReordered(false);
+    cu.setAtomsWereReordered(false);
    // With a cutoff in use, reorder atoms when the molecule list is invalid or when the
    // force-computation counter (read before it is incremented below) is a multiple of 100.
    // The flag passed to reorderAtoms() (enforcePeriodic) is false when molecules are invalid.
-//    if (nb.getUseCutoff() && includeNonbonded && (cu.getMoleculesAreInvalid() || cu.getComputeForceCount()%100 == 0)) {
+    if (nb.getUseCutoff() && includeNonbonded && (cu.getMoleculesAreInvalid() || cu.getComputeForceCount()%100 == 0)) {
-//        cu.reorderAtoms(!cu.getMoleculesAreInvalid());
+        cu.reorderAtoms(!cu.getMoleculesAreInvalid());
-//        nb.updateNeighborListSize();
+        nb.updateNeighborListSize();
-//    }
+    }
    cu.setComputeForceCount(cu.getComputeForceCount()+1);
    cu.clearAutoclearBuffers();
-//    if (includeNonbonded)
+    if (includeNonbonded)
-//        nb.prepareInteractions();
+        nb.prepareInteractions();
 }
 double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
    cu.getBondedUtilities().computeInteractions(groups);
-//    if ((groups&(1<<cu.getNonbondedUtilities().getForceGroup())) != 0)
+    if ((groups&(1<<cu.getNonbondedUtilities().getForceGroup())) != 0)
-//        cu.getNonbondedUtilities().computeInteractions();
+        cu.getNonbondedUtilities().computeInteractions();
    cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
    double sum = 0.0;
    if (includeEnergy) {
@@ -334,8 +334,8 @@ void CudaApplyConstraintsKernel::apply(ContextImpl& context, double tol) {
 //        hasInitializedKernel = true;
 //        map<string, string> defines;
 //        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-//        cu::Program program = cu.createProgram(CudaKernelSources::constraints, defines);
+//        CUmodule module = cu.createModule(CudaKernelSources::constraints, defines);
-//        applyDeltasKernel = cu::Kernel(program, "applyPositionDeltas");
+//        applyDeltasKernel = cu.getKernel(module, "applyPositionDeltas");
 //        applyDeltasKernel.setArg<cu::Buffer>(0, cu.getPosq().getDevicePointer());
 //        applyDeltasKernel.setArg<cu::Buffer>(1, cu.getIntegrationUtilities().getPosDelta().getDevicePointer());
 //    }
@@ -380,11 +380,13 @@ private:
 };
 // Destructor: makes this kernel's CUDA context current, then deletes params if it was allocated.
 CudaCalcHarmonicBondForceKernel::~CudaCalcHarmonicBondForceKernel() {
    // NOTE(review): presumably params owns device memory that must be freed with the
    // owning context current; the result of cuCtxSetCurrent is not checked here.
+    cuCtxSetCurrent(cu.getContext());
    if (params != NULL)
        delete params;
 }
 void CudaCalcHarmonicBondForceKernel::initialize(const System& system, const HarmonicBondForce& force) {
+    cuCtxSetCurrent(cu.getContext());
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumBonds()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumBonds()/numContexts;
@@ -465,6 +467,7 @@ private:
 };
 CudaCalcCustomBondForceKernel::~CudaCalcCustomBondForceKernel() {
+    cuCtxSetCurrent(cu.getContext());
    if (params != NULL)
        delete params;
    if (globals != NULL)
@@ -472,6 +475,7 @@ CudaCalcCustomBondForceKernel::~CudaCalcCustomBondForceKernel() {
 }
 void CudaCalcCustomBondForceKernel::initialize(const System& system, const CustomBondForce& force) {
+    cuCtxSetCurrent(cu.getContext());
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumBonds()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumBonds()/numContexts;
@@ -525,7 +529,7 @@ void CudaCalcCustomBondForceKernel::initialize(const System& system, const Custo
    }
    stringstream compute;
    for (int i = 0; i < (int) params->getBuffers().size(); i++) {
-        const CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
+        CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
        string argName = cu.getBondedUtilities().addArgument(buffer.getMemory(), buffer.getType());
        compute<<buffer.getType()<<" bondParams"<<(i+1)<<" = "<<argName<<"[index];\n";
    }
@@ -605,11 +609,13 @@ private:
 };
 CudaCalcHarmonicAngleForceKernel::~CudaCalcHarmonicAngleForceKernel() {
+    cuCtxSetCurrent(cu.getContext()); // Bind cu's CUDA context to the calling thread before params is deleted (return code is not checked).
     if (params != NULL)
         delete params;
 }
 void CudaCalcHarmonicAngleForceKernel::initialize(const System& system, const HarmonicAngleForce& force) {
+    cuCtxSetCurrent(cu.getContext());
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumAngles()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumAngles()/numContexts;
@@ -692,6 +698,7 @@ private:
 };
 CudaCalcCustomAngleForceKernel::~CudaCalcCustomAngleForceKernel() {
+    cuCtxSetCurrent(cu.getContext());
    if (params != NULL)
        delete params;
    if (globals != NULL)
@@ -699,6 +706,7 @@ CudaCalcCustomAngleForceKernel::~CudaCalcCustomAngleForceKernel() {
 }
 void CudaCalcCustomAngleForceKernel::initialize(const System& system, const CustomAngleForce& force) {
+    cuCtxSetCurrent(cu.getContext());
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumAngles()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumAngles()/numContexts;
@@ -752,7 +760,7 @@ void CudaCalcCustomAngleForceKernel::initialize(const System& system, const Cust
    }
    stringstream compute;
    for (int i = 0; i < (int) params->getBuffers().size(); i++) {
-        const CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
+        CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
        string argName = cu.getBondedUtilities().addArgument(buffer.getMemory(), buffer.getType());
        compute<<buffer.getType()<<" angleParams"<<(i+1)<<" = "<<argName<<"[index];\n";
    }
@@ -838,6 +846,7 @@ CudaCalcPeriodicTorsionForceKernel::~CudaCalcPeriodicTorsionForceKernel() {
 }
 void CudaCalcPeriodicTorsionForceKernel::initialize(const System& system, const PeriodicTorsionForce& force) {
+    cuCtxSetCurrent(cu.getContext());
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumTorsions()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumTorsions()/numContexts;
@@ -925,6 +934,7 @@ CudaCalcRBTorsionForceKernel::~CudaCalcRBTorsionForceKernel() {
 }
 void CudaCalcRBTorsionForceKernel::initialize(const System& system, const RBTorsionForce& force) {
+    cuCtxSetCurrent(cu.getContext());
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumTorsions()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumTorsions()/numContexts;
@@ -1024,6 +1034,7 @@ CudaCalcCMAPTorsionForceKernel::~CudaCalcCMAPTorsionForceKernel() {
 }
 void CudaCalcCMAPTorsionForceKernel::initialize(const System& system, const CMAPTorsionForce& force) {
+    cuCtxSetCurrent(cu.getContext());
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumTorsions()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumTorsions()/numContexts;
@@ -1110,6 +1121,7 @@ CudaCalcCustomTorsionForceKernel::~CudaCalcCustomTorsionForceKernel() {
 }
 void CudaCalcCustomTorsionForceKernel::initialize(const System& system, const CustomTorsionForce& force) {
+    cuCtxSetCurrent(cu.getContext());
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumTorsions()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumTorsions()/numContexts;
@@ -1163,7 +1175,7 @@ void CudaCalcCustomTorsionForceKernel::initialize(const System& system, const Cu
    }
    stringstream compute;
    for (int i = 0; i < (int) params->getBuffers().size(); i++) {
-        const CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
+        CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
        string argName = cu.getBondedUtilities().addArgument(buffer.getMemory(), buffer.getType());
        compute<<buffer.getType()<<" torsionParams"<<(i+1)<<" = "<<argName<<"[index];\n";
    }
@@ -1215,475 +1227,449 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
    cu.invalidateMolecules();
 }
-//class CudaNonbondedForceInfo : public CudaForceInfo {
+class CudaNonbondedForceInfo : public CudaForceInfo { // Describes a NonbondedForce's particles and exceptions so identical ones can be detected.
-//public:
+public:
-//    CudaNonbondedForceInfo(int requiredBuffers, const NonbondedForce& force) : CudaForceInfo(requiredBuffers), force(force) {
+    CudaNonbondedForceInfo(const NonbondedForce& force) : force(force) {
-//    }
+    }
-//    bool areParticlesIdentical(int particle1, int particle2) {
+    bool areParticlesIdentical(int particle1, int particle2) { // True when both particles have exactly equal charge, sigma, and epsilon.
-//        double charge1, charge2, sigma1, sigma2, epsilon1, epsilon2;
+        double charge1, charge2, sigma1, sigma2, epsilon1, epsilon2;
-//        force.getParticleParameters(particle1, charge1, sigma1, epsilon1);
+        force.getParticleParameters(particle1, charge1, sigma1, epsilon1);
-//        force.getParticleParameters(particle2, charge2, sigma2, epsilon2);
+        force.getParticleParameters(particle2, charge2, sigma2, epsilon2);
-//        return (charge1 == charge2 && sigma1 == sigma2 && epsilon1 == epsilon2);
+        return (charge1 == charge2 && sigma1 == sigma2 && epsilon1 == epsilon2);
-//    }
+    }
-//    int getNumParticleGroups() {
+    int getNumParticleGroups() { // Each exception of the force is reported as one particle group.
-//        return force.getNumExceptions();
+        return force.getNumExceptions();
-//    }
+    }
-//    void getParticlesInGroup(int index, vector<int>& particles) {
+    void getParticlesInGroup(int index, vector<int>& particles) { // Resizes particles to 2 and stores the two particle indices of exception `index`.
-//        int particle1, particle2;
+        int particle1, particle2;
-//        double chargeProd, sigma, epsilon;
+        double chargeProd, sigma, epsilon;
-//        force.getExceptionParameters(index, particle1, particle2, chargeProd, sigma, epsilon);
+        force.getExceptionParameters(index, particle1, particle2, chargeProd, sigma, epsilon);
-//        particles.resize(2);
+        particles.resize(2);
-//        particles[0] = particle1;
+        particles[0] = particle1;
-//        particles[1] = particle2;
+        particles[1] = particle2;
-//    }
+    }
-//    bool areGroupsIdentical(int group1, int group2) {
+    bool areGroupsIdentical(int group1, int group2) { // True when both exceptions have exactly equal chargeProd, sigma, and epsilon; the particle indices are not compared.
-//        int particle1, particle2;
+        int particle1, particle2;
-//        double chargeProd1, chargeProd2, sigma1, sigma2, epsilon1, epsilon2;
+        double chargeProd1, chargeProd2, sigma1, sigma2, epsilon1, epsilon2;
-//        force.getExceptionParameters(group1, particle1, particle2, chargeProd1, sigma1, epsilon1);
+        force.getExceptionParameters(group1, particle1, particle2, chargeProd1, sigma1, epsilon1);
-//        force.getExceptionParameters(group2, particle1, particle2, chargeProd2, sigma2, epsilon2);
+        force.getExceptionParameters(group2, particle1, particle2, chargeProd2, sigma2, epsilon2); // particle1/particle2 are overwritten here and never read.
-//        return (chargeProd1 == chargeProd2 && sigma1 == sigma2 && epsilon1 == epsilon2);
+        return (chargeProd1 == chargeProd2 && sigma1 == sigma2 && epsilon1 == epsilon2);
-//    }
+    }
-//private:
+private:
-//    const NonbondedForce& force;
+    const NonbondedForce& force; // Held by reference, so the NonbondedForce must outlive this object.
-//};
+};
-//
-//CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
+CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() { // Deletes every non-NULL array and the sorter owned by this kernel, then destroys the cuFFT plan if one was created.
-//    if (sigmaEpsilon != NULL)
+    cuCtxSetCurrent(cu.getContext()); // Bind cu's CUDA context to the calling thread before anything is released (return code is not checked).
-//        delete sigmaEpsilon;
+    if (sigmaEpsilon != NULL)
-//    if (exceptionParams != NULL)
+        delete sigmaEpsilon;
-//        delete exceptionParams;
+    if (exceptionParams != NULL)
-//    if (cosSinSums != NULL)
+        delete exceptionParams;
-//        delete cosSinSums;
+    if (cosSinSums != NULL)
-//    if (pmeGrid != NULL)
+        delete cosSinSums;
-//        delete pmeGrid;
+    if (pmeGrid != NULL)
-//    if (pmeGrid2 != NULL)
+        delete pmeGrid;
-//        delete pmeGrid2;
+    if (pmeBsplineModuliX != NULL)
-//    if (pmeBsplineModuliX != NULL)
+        delete pmeBsplineModuliX;
-//        delete pmeBsplineModuliX;
+    if (pmeBsplineModuliY != NULL)
-//    if (pmeBsplineModuliY != NULL)
+        delete pmeBsplineModuliY;
-//        delete pmeBsplineModuliY;
+    if (pmeBsplineModuliZ != NULL)
-//    if (pmeBsplineModuliZ != NULL)
+        delete pmeBsplineModuliZ;
-//        delete pmeBsplineModuliZ;
+    if (pmeBsplineTheta != NULL)
-//    if (pmeBsplineTheta != NULL)
+        delete pmeBsplineTheta;
-//        delete pmeBsplineTheta;
+    if (pmeBsplineDTheta != NULL) // NOTE(review): no allocation of pmeBsplineDTheta is visible in the PME setup of initialize(); presumably it stays NULL - confirm.
-//    if (pmeBsplineDTheta != NULL)
+        delete pmeBsplineDTheta;
-//        delete pmeBsplineDTheta;
+    if (pmeAtomRange != NULL)
-//    if (pmeAtomRange != NULL)
+        delete pmeAtomRange;
-//        delete pmeAtomRange;
+    if (pmeAtomGridIndex != NULL)
-//    if (pmeAtomGridIndex != NULL)
+        delete pmeAtomGridIndex;
-//        delete pmeAtomGridIndex;
+    if (sort != NULL)
-//    if (sort != NULL)
+        delete sort;
-//        delete sort;
+    if (hasInitializedFFT) // fft is filled in by cufftPlan3d rather than allocated with new, so a flag records whether a plan exists.
-//    if (fft != NULL)
+        cufftDestroy(fft); // Return code is not checked.
-//        delete fft;
+}
-//}
-//
+/**
-//void CudaCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
+ * Select the smallest FFT size >= minimum whose only prime factors are 2, 3, 5, and 7 (returns 1 if minimum < 1).
-//
+ */
-//    // Identify which exceptions are 1-4 interactions.
+static int findFFTDimension(int minimum) {
-//
+    if (minimum < 1)
-//    vector<pair<int, int> > exclusions;
+        return 1;
-//    vector<int> exceptions;
+    while (true) {
-//    for (int i = 0; i < force.getNumExceptions(); i++) {
+        // Divide out every factor from 2 through 7; the candidate is accepted if nothing but 1 remains.
-//        int particle1, particle2;
-//        double chargeProd, sigma, epsilon;
+        int unfactored = minimum;
-//        force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
+        for (int factor = 2; factor < 8; factor++) { // Trying 4 and 6 is harmless: 2 and 3 have already been divided out.
-//        exclusions.push_back(pair<int, int>(particle1, particle2));
+            while (unfactored > 1 && unfactored%factor == 0)
-//        if (chargeProd != 0.0 || epsilon != 0.0)
+                unfactored /= factor;
-//            exceptions.push_back(i);
+        }
-//    }
+        if (unfactored == 1)
-//
+            return minimum;
-//    // Initialize nonbonded interactions.
+        minimum++; // A larger prime factor remains, so try the next larger size.
-//
+    }
-//    int numParticles = force.getNumParticles();
+}
-//    sigmaEpsilon = new CudaArray<mm_float2>(cu, numParticles, "sigmaEpsilon");
-//    CudaArray<mm_float4>& posq = cu.getPosq();
+void CudaCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
-//    vector<mm_float2> sigmaEpsilonVector(numParticles);
+    cuCtxSetCurrent(cu.getContext());
-//    vector<vector<int> > exclusionList(numParticles);
-//    double sumSquaredCharges = 0.0;
+    // Identify which exceptions are 1-4 interactions.
-//    hasCoulomb = false;
-//    hasLJ = false;
+    vector<pair<int, int> > exclusions;
-//    for (int i = 0; i < numParticles; i++) {
+    vector<int> exceptions;
-//        double charge, sigma, epsilon;
+    for (int i = 0; i < force.getNumExceptions(); i++) {
-//        force.getParticleParameters(i, charge, sigma, epsilon);
+        int particle1, particle2;
-//        posq[i].w = (float) charge;
+        double chargeProd, sigma, epsilon;
-//        sigmaEpsilonVector[i] = mm_float2((float) (0.5*sigma), (float) (2.0*sqrt(epsilon)));
+        force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
-//        exclusionList[i].push_back(i);
+        exclusions.push_back(pair<int, int>(particle1, particle2));
-//        sumSquaredCharges += charge*charge;
+        if (chargeProd != 0.0 || epsilon != 0.0)
-//        if (charge != 0.0)
+            exceptions.push_back(i);
-//            hasCoulomb = true;
+    }
-//        if (epsilon != 0.0)
-//            hasLJ = true;
+    // Initialize nonbonded interactions.
-//    }
-//    for (int i = 0; i < (int) exclusions.size(); i++) {
+    int numParticles = force.getNumParticles();
-//        exclusionList[exclusions[i].first].push_back(exclusions[i].second);
+    sigmaEpsilon = CudaArray::create<float2>(numParticles, "sigmaEpsilon");
-//        exclusionList[exclusions[i].second].push_back(exclusions[i].first);
+    CudaArray& posq = cu.getPosq();
-//    }
+    float4* posqf = (float4*) cu.getPinnedBuffer();
-//    posq.upload();
+    double4* posqd = (double4*) cu.getPinnedBuffer();
-//    sigmaEpsilon->upload(sigmaEpsilonVector);
+    vector<float2> sigmaEpsilonVector(numParticles);
-//    bool useCutoff = (force.getNonbondedMethod() != NonbondedForce::NoCutoff);
+    vector<vector<int> > exclusionList(numParticles);
-//    bool usePeriodic = (force.getNonbondedMethod() != NonbondedForce::NoCutoff && force.getNonbondedMethod() != NonbondedForce::CutoffNonPeriodic);
+    double sumSquaredCharges = 0.0;
-//    map<string, string> defines;
+    hasCoulomb = false;
-//    defines["HAS_COULOMB"] = (hasCoulomb ? "1" : "0");
+    hasLJ = false;
-//    defines["HAS_LENNARD_JONES"] = (hasLJ ? "1" : "0");
+    for (int i = 0; i < numParticles; i++) {
-//    if (useCutoff) {
+        double charge, sigma, epsilon;
-//        // Compute the reaction field constants.
+        force.getParticleParameters(i, charge, sigma, epsilon);
-//
+        if (cu.getUseDoublePrecision())
-//        double reactionFieldK = pow(force.getCutoffDistance(), -3.0)*(force.getReactionFieldDielectric()-1.0)/(2.0*force.getReactionFieldDielectric()+1.0);
+            posqd[i] = make_double4(0, 0, 0, charge);
-//        double reactionFieldC = (1.0 / force.getCutoffDistance())*(3.0*force.getReactionFieldDielectric())/(2.0*force.getReactionFieldDielectric()+1.0);
+        else
-//        defines["REACTION_FIELD_K"] = cu.doubleToString(reactionFieldK);
+            posqf[i] = make_float4(0, 0, 0, (float) charge);
-//        defines["REACTION_FIELD_C"] = cu.doubleToString(reactionFieldC);
+        sigmaEpsilonVector[i] = make_float2((float) (0.5*sigma), (float) (2.0*sqrt(epsilon)));
-//    }
+        exclusionList[i].push_back(i);
-//    if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0)
+        sumSquaredCharges += charge*charge;
-//        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
+        if (charge != 0.0)
-//    else
+            hasCoulomb = true;
-//        dispersionCoefficient = 0.0;
+        if (epsilon != 0.0)
-//    alpha = 0;
+            hasLJ = true;
-//    if (force.getNonbondedMethod() == NonbondedForce::Ewald) {
+    }
-//        // Compute the Ewald parameters.
+    for (int i = 0; i < (int) exclusions.size(); i++) {
-//
+        exclusionList[exclusions[i].first].push_back(exclusions[i].second);
-//        int kmaxx, kmaxy, kmaxz;
+        exclusionList[exclusions[i].second].push_back(exclusions[i].first);
-//        NonbondedForceImpl::calcEwaldParameters(system, force, alpha, kmaxx, kmaxy, kmaxz);
+    }
-//        defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
+    posq.upload(cu.getPinnedBuffer());
-//        defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
+    sigmaEpsilon->upload(sigmaEpsilonVector);
-//        defines["USE_EWALD"] = "1";
+    bool useCutoff = (force.getNonbondedMethod() != NonbondedForce::NoCutoff);
-//        ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
+    bool usePeriodic = (force.getNonbondedMethod() != NonbondedForce::NoCutoff && force.getNonbondedMethod() != NonbondedForce::CutoffNonPeriodic);
-//
+    map<string, string> defines;
-//        // Create the reciprocal space kernels.
+    defines["HAS_COULOMB"] = (hasCoulomb ? "1" : "0");
-//
+    defines["HAS_LENNARD_JONES"] = (hasLJ ? "1" : "0");
-//        map<string, string> replacements;
+    if (useCutoff) {
-//        replacements["NUM_ATOMS"] = cu.intToString(numParticles);
+        // Compute the reaction field constants.
-//        replacements["KMAX_X"] = cu.intToString(kmaxx);
-//        replacements["KMAX_Y"] = cu.intToString(kmaxy);
+        double reactionFieldK = pow(force.getCutoffDistance(), -3.0)*(force.getReactionFieldDielectric()-1.0)/(2.0*force.getReactionFieldDielectric()+1.0);
-//        replacements["KMAX_Z"] = cu.intToString(kmaxz);
+        double reactionFieldC = (1.0 / force.getCutoffDistance())*(3.0*force.getReactionFieldDielectric())/(2.0*force.getReactionFieldDielectric()+1.0);
-//        replacements["EXP_COEFFICIENT"] = cu.doubleToString(-1.0/(4.0*alpha*alpha));
+        defines["REACTION_FIELD_K"] = cu.doubleToString(reactionFieldK);
-//        cu::Program program = cu.createProgram(CudaKernelSources::ewald, replacements);
+        defines["REACTION_FIELD_C"] = cu.doubleToString(reactionFieldC);
-//        ewaldSumsKernel = cu::Kernel(program, "calculateEwaldCosSinSums");
+    }
-//        ewaldForcesKernel = cu::Kernel(program, "calculateEwaldForces");
+    if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0)
-//        cosSinSums = new CudaArray<mm_float2>(cu, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), "cosSinSums");
+        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
-//    }
+    else
-//    else if (force.getNonbondedMethod() == NonbondedForce::PME) {
+        dispersionCoefficient = 0.0;
-//        // Compute the PME parameters.
+    alpha = 0;
-//
+    if (force.getNonbondedMethod() == NonbondedForce::Ewald) {
-//        int gridSizeX, gridSizeY, gridSizeZ;
+        // Compute the Ewald parameters.
-//        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ);
-//        gridSizeX = CudaFFT3D::findLegalDimension(gridSizeX);
+        int kmaxx, kmaxy, kmaxz;
-//        gridSizeY = CudaFFT3D::findLegalDimension(gridSizeY);
+        NonbondedForceImpl::calcEwaldParameters(system, force, alpha, kmaxx, kmaxy, kmaxz);
-//        gridSizeZ = CudaFFT3D::findLegalDimension(gridSizeZ);
+        defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
-//        defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
+        defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
-//        defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
+        defines["USE_EWALD"] = "1";
-//        defines["USE_EWALD"] = "1";
+        ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
-//        ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
-//        pmeDefines["PME_ORDER"] = cu.intToString(PmeOrder);
+        // Create the reciprocal space kernels.
-//        pmeDefines["NUM_ATOMS"] = cu.intToString(numParticles);
-//        pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(alpha*alpha));
+        map<string, string> replacements;
-//        pmeDefines["GRID_SIZE_X"] = cu.intToString(gridSizeX);
+        replacements["NUM_ATOMS"] = cu.intToString(numParticles);
-//        pmeDefines["GRID_SIZE_Y"] = cu.intToString(gridSizeY);
+        replacements["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-//        pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ);
+        replacements["KMAX_X"] = cu.intToString(kmaxx);
-//        pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(sqrt(ONE_4PI_EPS0));
+        replacements["KMAX_Y"] = cu.intToString(kmaxy);
-//
+        replacements["KMAX_Z"] = cu.intToString(kmaxz);
-//        // Create required data structures.
+        replacements["EXP_COEFFICIENT"] = cu.doubleToString(-1.0/(4.0*alpha*alpha));
-//
+        replacements["ONE_4PI_EPS0"] = cu.doubleToString(ONE_4PI_EPS0);
-//        pmeGrid = new CudaArray<mm_float2>(cu, gridSizeX*gridSizeY*gridSizeZ, "pmeGrid");
+        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::ewald, replacements);
-//        cu.addAutoclearBuffer(pmeGrid->getDevicePointer(), pmeGrid->getSize()*2);
+        ewaldSumsKernel = cu.getKernel(module, "calculateEwaldCosSinSums");
-//        pmeGrid2 = new CudaArray<mm_float2>(cu, gridSizeX*gridSizeY*gridSizeZ, "pmeGrid2");
+        ewaldForcesKernel = cu.getKernel(module, "calculateEwaldForces");
-//        pmeBsplineModuliX = new CudaArray<cl_float>(cu, gridSizeX, "pmeBsplineModuliX");
+        int elementSize = (cu.getUseDoublePrecision() ? sizeof(double2) : sizeof(float2));
-//        pmeBsplineModuliY = new CudaArray<cl_float>(cu, gridSizeY, "pmeBsplineModuliY");
+        cosSinSums = new CudaArray((2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
-//        pmeBsplineModuliZ = new CudaArray<cl_float>(cu, gridSizeZ, "pmeBsplineModuliZ");
+    }
-//        pmeBsplineTheta = new CudaArray<mm_float4>(cu, PmeOrder*numParticles, "pmeBsplineTheta");
+    else if (force.getNonbondedMethod() == NonbondedForce::PME) {
-//        bool deviceIsCpu = (cu.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
+        // Compute the PME parameters.
-//        if (deviceIsCpu)
-//            pmeBsplineDTheta = new CudaArray<mm_float4>(cu, PmeOrder*numParticles, "pmeBsplineDTheta");
+        int gridSizeX, gridSizeY, gridSizeZ;
-//        pmeAtomRange = new CudaArray<cl_int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
+        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ);
-//        pmeAtomGridIndex = new CudaArray<mm_int2>(cu, numParticles, "pmeAtomGridIndex");
+        gridSizeX = findFFTDimension(gridSizeX);
-//        sort = new CudaSort<SortTrait>(cu, cu.getNumAtoms());
+        gridSizeY = findFFTDimension(gridSizeY);
-//        fft = new CudaFFT3D(cu, gridSizeX, gridSizeY, gridSizeZ);
+        gridSizeZ = findFFTDimension(gridSizeZ);
-//
+        defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
-//        // Initialize the b-spline moduli.
+        defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
-//
+        defines["USE_EWALD"] = "1";
-//        int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ);
+        ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
-//        vector<double> data(PmeOrder);
+        pmeDefines["PME_ORDER"] = cu.intToString(PmeOrder);
-//        vector<double> ddata(PmeOrder);
+        pmeDefines["NUM_ATOMS"] = cu.intToString(numParticles);
-//        vector<double> bsplines_data(maxSize);
+        pmeDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-//        data[PmeOrder-1] = 0.0;
+        pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(alpha*alpha));
-//        data[1] = 0.0;
+        pmeDefines["GRID_SIZE_X"] = cu.intToString(gridSizeX);
-//        data[0] = 1.0;
+        pmeDefines["GRID_SIZE_Y"] = cu.intToString(gridSizeY);
-//        for (int i = 3; i < PmeOrder; i++) {
+        pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ);
-//            double div = 1.0/(i-1.0);
+        pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(sqrt(ONE_4PI_EPS0));
-//            data[i-1] = 0.0;
+        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
-//            for (int j = 1; j < (i-1); j++)
+        pmeUpdateBsplinesKernel = cu.getKernel(module, "updateBsplines");
-//                data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
+        pmeAtomRangeKernel = cu.getKernel(module, "findAtomRangeForGrid");
-//            data[0] = div*data[0];
+        pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
-//        }
+        pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
-//
+        pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
-//        // Differentiate.
+        pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
-//
-//        ddata[0] = -data[0];
+        // Create required data structures.
-//        for (int i = 1; i < PmeOrder; i++)
-//            ddata[i] = data[i-1]-data[i];
+        int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
-//        double div = 1.0/(PmeOrder-1);
+        pmeGrid = new CudaArray(gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid");
-//        data[PmeOrder-1] = 0.0;
+        cu.addAutoclearBuffer(pmeGrid->getDevicePointer(), pmeGrid->getSize()*sizeof(float2));
-//        for (int i = 1; i < (PmeOrder-1); i++)
+        pmeBsplineModuliX = new CudaArray(gridSizeX, elementSize, "pmeBsplineModuliX");
-//            data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]);
+        pmeBsplineModuliY = new CudaArray(gridSizeY, elementSize, "pmeBsplineModuliY");
-//        data[0] = div*data[0];
+        pmeBsplineModuliZ = new CudaArray(gridSizeZ, elementSize, "pmeBsplineModuliZ");
-//        for (int i = 0; i < maxSize; i++)
+        pmeBsplineTheta = new CudaArray(PmeOrder*numParticles, 4*elementSize, "pmeBsplineTheta");
-//            bsplines_data[i] = 0.0;
+        pmeAtomRange = CudaArray::create<int>(gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
-//        for (int i = 1; i <= PmeOrder; i++)
+        pmeAtomGridIndex = CudaArray::create<int2>(numParticles, "pmeAtomGridIndex");
-//            bsplines_data[i] = data[i-1];
+        sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
-//
+        cufftResult result = cufftPlan3d(&fft, gridSizeX, gridSizeY, gridSizeZ, CUFFT_C2C);
-//        // Evaluate the actual bspline moduli for X/Y/Z.
+        if (result != CUFFT_SUCCESS)
-//
+            throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
-//        for(int dim = 0; dim < 3; dim++) {
+        hasInitializedFFT = true;
-//            int ndata = (dim == 0 ? gridSizeX : dim == 1 ? gridSizeY : gridSizeZ);
-//            vector<cl_float> moduli(ndata);
+        // Initialize the b-spline moduli.
-//            for (int i = 0; i < ndata; i++) {
-//                double sc = 0.0;
+        int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ);
-//                double ss = 0.0;
+        vector<double> data(PmeOrder);
-//                for (int j = 0; j < ndata; j++) {
+        vector<double> ddata(PmeOrder);
-//                    double arg = (2.0*M_PI*i*j)/ndata;
+        vector<double> bsplines_data(maxSize);
-//                    sc += bsplines_data[j]*cos(arg);
+        data[PmeOrder-1] = 0.0;
-//                    ss += bsplines_data[j]*sin(arg);
+        data[1] = 0.0;
-//                }
+        data[0] = 1.0;
-//                moduli[i] = (float) (sc*sc+ss*ss);
+        for (int i = 3; i < PmeOrder; i++) {
-//            }
+            double div = 1.0/(i-1.0);
-//            for (int i = 0; i < ndata; i++)
+            data[i-1] = 0.0;
-//            {
+            for (int j = 1; j < (i-1); j++)
-//                if (moduli[i] < 1.0e-7)
+                data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
-//                    moduli[i] = (moduli[i-1]+moduli[i+1])*0.5f;
+            data[0] = div*data[0];
-//            }
+        }
-//            if (dim == 0)
-//                pmeBsplineModuliX->upload(moduli);
+        // Differentiate.
-//            else if (dim == 1)
-//                pmeBsplineModuliY->upload(moduli);
+        ddata[0] = -data[0];
-//            else
+        for (int i = 1; i < PmeOrder; i++)
-//                pmeBsplineModuliZ->upload(moduli);
+            ddata[i] = data[i-1]-data[i];
-//        }
+        double div = 1.0/(PmeOrder-1);
-//    }
+        data[PmeOrder-1] = 0.0;
-//    else
+        for (int i = 1; i < (PmeOrder-1); i++)
-//        ewaldSelfEnergy = 0.0;
+            data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]);
-//
+        data[0] = div*data[0];
-//    // Add the interaction to the default nonbonded kernel.
+        for (int i = 0; i < maxSize; i++)
-//    
+            bsplines_data[i] = 0.0;
-//    string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines);
+        for (int i = 1; i <= PmeOrder; i++)
-//    cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
+            bsplines_data[i] = data[i-1];
-//    if (hasLJ)
-//        cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo("sigmaEpsilon", "float", 2, sizeof(cl_float2), sigmaEpsilon->getDevicePointer()));
+        // Evaluate the actual bspline moduli for X/Y/Z.
-//
-//    // Initialize the exceptions.
+        for(int dim = 0; dim < 3; dim++) {
-//
+            int ndata = (dim == 0 ? gridSizeX : dim == 1 ? gridSizeY : gridSizeZ);
-//    int numContexts = cu.getPlatformData().contexts.size();
+            vector<double> moduli(ndata);
-//    int startIndex = cu.getContextIndex()*exceptions.size()/numContexts;
+            for (int i = 0; i < ndata; i++) {
-//    int endIndex = (cu.getContextIndex()+1)*exceptions.size()/numContexts;
+                double sc = 0.0;
-//    int numExceptions = endIndex-startIndex;
+                double ss = 0.0;
-//    if (numExceptions > 0) {
+                for (int j = 0; j < ndata; j++) {
-//        exceptionAtoms.resize(numExceptions);
+                    double arg = (2.0*M_PI*i*j)/ndata;
-//        vector<vector<int> > atoms(numExceptions, vector<int>(2));
+                    sc += bsplines_data[j]*cos(arg);
-//        exceptionParams = new CudaArray<mm_float4>(cu, numExceptions, "exceptionParams");
+                    ss += bsplines_data[j]*sin(arg);
-//        vector<mm_float4> exceptionParamsVector(numExceptions);
+                }
-//        for (int i = 0; i < numExceptions; i++) {
+                moduli[i] = sc*sc+ss*ss;
-//            double chargeProd, sigma, epsilon;
+            }
-//            force.getExceptionParameters(exceptions[startIndex+i], atoms[i][0], atoms[i][1], chargeProd, sigma, epsilon);
+            for (int i = 0; i < ndata; i++)
-//            exceptionParamsVector[i] = mm_float4((float) (ONE_4PI_EPS0*chargeProd), (float) sigma, (float) (4.0*epsilon), 0.0f);
+                if (moduli[i] < 1.0e-7)
-//            exceptionAtoms[i] = make_pair(atoms[i][0], atoms[i][1]);
+                    moduli[i] = (moduli[i-1]+moduli[i+1])*0.5;
-//        }
+            if (cu.getUseDoublePrecision()) {
-//        exceptionParams->upload(exceptionParamsVector);
+                if (dim == 0)
-//        map<string, string> replacements;
+                    pmeBsplineModuliX->upload(moduli);
-//        replacements["PARAMS"] = cu.getBondedUtilities().addArgument(exceptionParams->getDevicePointer(), "float4");
+                else if (dim == 1)
-//        cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaKernelSources::nonbondedExceptions, replacements), force.getForceGroup());
+                    pmeBsplineModuliY->upload(moduli);
-//    }
+                else
-//    cu.addForce(new CudaNonbondedForceInfo(cu.getNonbondedUtilities().getNumForceBuffers(), force));
+                    pmeBsplineModuliZ->upload(moduli);
-//}
+            }
-//
+            else {
-//double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
+                vector<float> modulif(ndata);
-//    bool deviceIsCpu = (cu.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
+                for (int i = 0; i < ndata; i++)
-//    if (!hasInitializedKernel) {
+                    modulif[i] = (float) moduli[i];
-//        hasInitializedKernel = true;
+                if (dim == 0)
-//        if (cosSinSums != NULL) {
+                    pmeBsplineModuliX->upload(modulif);
-//            ewaldSumsKernel.setArg<cu::Buffer>(0, cu.getEnergyBuffer().getDevicePointer());
+                else if (dim == 1)
-//            ewaldSumsKernel.setArg<cu::Buffer>(1, cu.getPosq().getDevicePointer());
+                    pmeBsplineModuliY->upload(modulif);
-//            ewaldSumsKernel.setArg<cu::Buffer>(2, cosSinSums->getDevicePointer());
+                else
-//            ewaldForcesKernel.setArg<cu::Buffer>(0, cu.getForceBuffers().getDevicePointer());
+                    pmeBsplineModuliZ->upload(modulif);
-//            ewaldForcesKernel.setArg<cu::Buffer>(1, cu.getPosq().getDevicePointer());
+            }
-//            ewaldForcesKernel.setArg<cu::Buffer>(2, cosSinSums->getDevicePointer());
+        }
-//        }
+    }
-//        if (pmeGrid != NULL) {
+    else
-//            string file = (deviceIsCpu ? CudaKernelSources::pme_cpu : CudaKernelSources::pme);
+        ewaldSelfEnergy = 0.0;
-//            cu::Program program = cu.createProgram(file, pmeDefines);
-//            pmeUpdateBsplinesKernel = cu::Kernel(program, "updateBsplines");
+    // Add the interaction to the default nonbonded kernel.
-//            pmeAtomRangeKernel = cu::Kernel(program, "findAtomRangeForGrid");
-//	    if (!deviceIsCpu)
+    string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines);
-//                pmeZIndexKernel = cu::Kernel(program, "recordZIndex");
+    cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
-//            pmeSpreadChargeKernel = cu::Kernel(program, "gridSpreadCharge");
+    if (hasLJ)
-//            pmeConvolutionKernel = cu::Kernel(program, "reciprocalConvolution");
+        cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo("sigmaEpsilon", "float", 2, sizeof(float2), sigmaEpsilon->getDevicePointer()));
-//            pmeInterpolateForceKernel = cu::Kernel(program, "gridInterpolateForce");
-//            pmeUpdateBsplinesKernel.setArg<cu::Buffer>(0, cu.getPosq().getDevicePointer());
+    // Initialize the exceptions.
-//            pmeUpdateBsplinesKernel.setArg<cu::Buffer>(1, pmeBsplineTheta->getDevicePointer());
-//            pmeUpdateBsplinesKernel.setArg(2, CudaContext::ThreadBlockSize*PmeOrder*sizeof(mm_float4), NULL);
+    int numContexts = cu.getPlatformData().contexts.size();
-//            pmeUpdateBsplinesKernel.setArg<cu::Buffer>(3, pmeAtomGridIndex->getDevicePointer());
+    int startIndex = cu.getContextIndex()*exceptions.size()/numContexts;
-//            if (deviceIsCpu)
+    int endIndex = (cu.getContextIndex()+1)*exceptions.size()/numContexts;
-//                pmeUpdateBsplinesKernel.setArg<cu::Buffer>(6, pmeBsplineDTheta->getDevicePointer());
+    int numExceptions = endIndex-startIndex;
-//            pmeAtomRangeKernel.setArg<cu::Buffer>(0, pmeAtomGridIndex->getDevicePointer());
+    if (numExceptions > 0) {
-//            pmeAtomRangeKernel.setArg<cu::Buffer>(1, pmeAtomRange->getDevicePointer());
+        exceptionAtoms.resize(numExceptions);
-//            pmeAtomRangeKernel.setArg<cu::Buffer>(2, cu.getPosq().getDevicePointer());
+        vector<vector<int> > atoms(numExceptions, vector<int>(2));
-//	    if (!deviceIsCpu) {
+        exceptionParams = CudaArray::create<float4>(numExceptions, "exceptionParams");
-//                pmeZIndexKernel.setArg<cu::Buffer>(0, pmeAtomGridIndex->getDevicePointer());
+        vector<float4> exceptionParamsVector(numExceptions);
-//                pmeZIndexKernel.setArg<cu::Buffer>(1, cu.getPosq().getDevicePointer());
+        for (int i = 0; i < numExceptions; i++) {
-//	    }
+            double chargeProd, sigma, epsilon;
-//            pmeSpreadChargeKernel.setArg<cu::Buffer>(0, cu.getPosq().getDevicePointer());
+            force.getExceptionParameters(exceptions[startIndex+i], atoms[i][0], atoms[i][1], chargeProd, sigma, epsilon);
-//            pmeSpreadChargeKernel.setArg<cu::Buffer>(1, pmeAtomGridIndex->getDevicePointer());
+            exceptionParamsVector[i] = make_float4((float) (ONE_4PI_EPS0*chargeProd), (float) sigma, (float) (4.0*epsilon), 0.0f);
-//            pmeSpreadChargeKernel.setArg<cu::Buffer>(2, pmeAtomRange->getDevicePointer());
+            exceptionAtoms[i] = make_pair(atoms[i][0], atoms[i][1]);
-//            pmeSpreadChargeKernel.setArg<cu::Buffer>(3, pmeGrid->getDevicePointer());
+        }
-//            pmeSpreadChargeKernel.setArg<cu::Buffer>(4, pmeBsplineTheta->getDevicePointer());
+        exceptionParams->upload(exceptionParamsVector);
-//            pmeConvolutionKernel.setArg<cu::Buffer>(0, pmeGrid2->getDevicePointer());
+        map<string, string> replacements;
-//            pmeConvolutionKernel.setArg<cu::Buffer>(1, cu.getEnergyBuffer().getDevicePointer());
+        replacements["PARAMS"] = cu.getBondedUtilities().addArgument(exceptionParams->getDevicePointer(), "float4");
-//            pmeConvolutionKernel.setArg<cu::Buffer>(2, pmeBsplineModuliX->getDevicePointer());
+        cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaKernelSources::nonbondedExceptions, replacements), force.getForceGroup());
-//            pmeConvolutionKernel.setArg<cu::Buffer>(3, pmeBsplineModuliY->getDevicePointer());
+    }
-//            pmeConvolutionKernel.setArg<cu::Buffer>(4, pmeBsplineModuliZ->getDevicePointer());
+    cu.addForce(new CudaNonbondedForceInfo(force));
-//            interpolateForceThreads = (cu.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() > 2*128*PmeOrder*sizeof(mm_float4) ? 128 : 64);
+}
-//            pmeInterpolateForceKernel.setArg<cu::Buffer>(0, cu.getPosq().getDevicePointer());
-//            pmeInterpolateForceKernel.setArg<cu::Buffer>(1, cu.getForceBuffers().getDevicePointer());
+// Throw an OpenMMException naming the transform direction and the cuFFT status code if a cuFFT
+// call did not return CUFFT_SUCCESS.
+static void checkFftResult(cufftResult result, const char* direction) {
+    if (result == CUFFT_SUCCESS)
+        return;
+    stringstream message;
+    message<<"Error executing "<<direction<<" FFT: cuFFT returned status "<<(int) result;
+    throw OpenMMException(message.str());
+}
+// Launch the reciprocal space kernels for Ewald or PME and return the energy terms that are
+// evaluated on the host.
+//
+// The Ewald and PME kernels are only launched when includeReciprocal is true and this is context 0.
+// Nothing in this function launches the direct space interaction.
+// The context, includeForces and includeEnergy arguments are not used here.
+//
+// Returns ewaldSelfEnergy (if includeReciprocal) plus dispersionCoefficient divided by the periodic
+// box volume (if includeDirect and the coefficient is nonzero).
+// Throws OpenMMException if either FFT reports an error.
+double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
-//            pmeInterpolateForceKernel.setArg<cu::Buffer>(2, pmeGrid->getDevicePointer());
+    // Ewald: one launch over the entries of cosSinSums, then one launch over the atoms.
+    if (cosSinSums != NULL && cu.getContextIndex() == 0 && includeReciprocal) {
-//            if (deviceIsCpu) {
+        void* sumsArgs[] = {&cu.getEnergyBuffer().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums->getDevicePointer(), cu.getPeriodicBoxSizePointer()};
-//                pmeInterpolateForceKernel.setArg<cu::Buffer>(5, pmeBsplineTheta->getDevicePointer());
+        cu.executeKernel(ewaldSumsKernel, sumsArgs, cosSinSums->getSize());
-//                pmeInterpolateForceKernel.setArg<cu::Buffer>(6, pmeBsplineDTheta->getDevicePointer());
+        void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums->getDevicePointer(), cu.getPeriodicBoxSizePointer()};
-//            }
+        cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
-//            else
+    }
-//                pmeInterpolateForceKernel.setArg(5, 2*interpolateForceThreads*PmeOrder*sizeof(mm_float4), NULL);
+    // PME: the kernels below are launched in a fixed order, with an in-place FFT of pmeGrid before
+    // and after the convolution kernel.
+    if (pmeGrid != NULL && cu.getContextIndex() == 0 && includeReciprocal) {
-//            if (cu.getSupports64BitGlobalAtomics()) {
+        void* bsplinesArgs[] = {&cu.getPosq().getDevicePointer(), &pmeBsplineTheta->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(),
-//                pmeFinishSpreadChargeKernel = cu::Kernel(program, "finishSpreadCharge");
+                cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
-//                pmeFinishSpreadChargeKernel.setArg<cu::Buffer>(0, pmeGrid->getDevicePointer());
+        // Dynamic shared memory: PmeOrder four-component values per thread of the block.
+        int bsplinesSharedSize = cu.ThreadBlockSize*PmeOrder*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4));
-//            }
+        cu.executeKernel(pmeUpdateBsplinesKernel, bsplinesArgs, cu.getNumAtoms(), cu.ThreadBlockSize, bsplinesSharedSize);
-//       }
+        sort->sort(*pmeAtomGridIndex);
-//    }
+        void* rangeArgs[] = {&pmeAtomGridIndex->getDevicePointer(), &pmeAtomRange->getDevicePointer(), &cu.getPosq().getDevicePointer(),
-//    if (cosSinSums != NULL && cu.getContextIndex() == 0 && includeReciprocal) {
+                cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
-//        mm_float4 boxSize = cu.getPeriodicBoxSize();
+        cu.executeKernel(pmeAtomRangeKernel, rangeArgs, cu.getNumAtoms());
-//        mm_float4 recipBoxSize = mm_float4((float) (2*M_PI/boxSize.x), (float) (2*M_PI/boxSize.y), (float) (2*M_PI/boxSize.z), 0);
+        void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid->getDevicePointer(), &pmeBsplineTheta->getDevicePointer(),
-//        float recipCoefficient = (float) (ONE_4PI_EPS0*4*M_PI/(boxSize.x*boxSize.y*boxSize.z));
+                cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
-//        ewaldSumsKernel.setArg<mm_float4>(3, recipBoxSize);
+        cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), PmeOrder*PmeOrder*PmeOrder);
-//        ewaldSumsKernel.setArg<cl_float>(4, recipCoefficient);
+        void* finishSpreadArgs[] = {&pmeGrid->getDevicePointer()};
-//        cu.executeKernel(ewaldSumsKernel, cosSinSums->getSize());
+        cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid->getSize());
-//        ewaldForcesKernel.setArg<mm_float4>(3, recipBoxSize);
+        // Transform the grid in place. The status is checked because cuFFT reports failures only
+        // through its return value; ignoring it would let later kernels read a garbage grid.
+        // NOTE(review): cufftExecC2C operates on single precision complex data, while the shared
+        // memory sizes in this function allow for double precision. Confirm how pmeGrid is stored
+        // when cu.getUseDoublePrecision() is true.
+        checkFftResult(cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD), "forward");
-//        ewaldForcesKernel.setArg<cl_float>(4, recipCoefficient);
+        void* convolutionArgs[] = {&pmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(),
-//        cu.executeKernel(ewaldForcesKernel, cu.getNumAtoms());
+                &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
-//    }
+        cu.executeKernel(pmeConvolutionKernel, convolutionArgs, cu.getNumAtoms());
-//    if (pmeGrid != NULL && cu.getContextIndex() == 0 && includeReciprocal) {
+        checkFftResult(cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE), "inverse");
-//        mm_float4 boxSize = cu.getPeriodicBoxSize();
+        void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid->getDevicePointer(),
-//        mm_float4 invBoxSize = cu.getInvPeriodicBoxSize();
+                cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
-//        pmeUpdateBsplinesKernel.setArg<mm_float4>(4, boxSize);
+        // The block size for the interpolation kernel is fixed at 64 threads.
+        interpolateForceThreads = 64;
-//        pmeUpdateBsplinesKernel.setArg<mm_float4>(5, invBoxSize);
+        int interpolateSharedSize = 2*interpolateForceThreads*PmeOrder*(cu.getUseDoublePrecision() ? sizeof(double3) : sizeof(float3));
-//        cu.executeKernel(pmeUpdateBsplinesKernel, cu.getNumAtoms());
+        cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), interpolateForceThreads, interpolateSharedSize);
-//        if (deviceIsCpu) {
+    }
-//            pmeSpreadChargeKernel.setArg<mm_float4>(5, boxSize);
+    // Energy terms that do not require a kernel launch.
+    double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
-//            pmeSpreadChargeKernel.setArg<mm_float4>(6, invBoxSize);
+    if (dispersionCoefficient != 0.0 && includeDirect) {
-//            cu.executeKernel(pmeSpreadChargeKernel, 2*cu.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1);
+        double4 boxSize = cu.getPeriodicBoxSize();
-//        }
+        energy += dispersionCoefficient/(boxSize.x*boxSize.y*boxSize.z);
-//        else {
+    }
-//            sort->sort(*pmeAtomGridIndex);
+    return energy;
-//            pmeAtomRangeKernel.setArg<mm_float4>(3, boxSize);
+}
-//            pmeAtomRangeKernel.setArg<mm_float4>(4, invBoxSize);
-//            cu.executeKernel(pmeAtomRangeKernel, cu.getNumAtoms());
+void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const NonbondedForce& force) { // Re-reads the particle and exception parameters from force and uploads them to the device; throws OpenMMException if the change cannot be applied.
-//    // Make sure the new parameters are acceptable.
+    // Make sure the new parameters are acceptable: the particle count, the interaction types in use, and the set of non-excluded exceptions must be unchanged.
-//                pmeSpreadChargeKernel.setArg<mm_float4>(5, boxSize);
-//                pmeSpreadChargeKernel.setArg<mm_float4>(6, invBoxSize);
+    cuCtxSetCurrent(cu.getContext()); // bind this kernel's CUDA context to the calling thread (the return code is not checked)
-//                cu.executeKernel(pmeSpreadChargeKernel, cu.getNumAtoms(), PmeOrder*PmeOrder*PmeOrder);
+    if (force.getNumParticles() != cu.getNumAtoms())
-//                cu.executeKernel(pmeFinishSpreadChargeKernel, pmeGrid->getSize());
+        throw OpenMMException("updateParametersInContext: The number of particles has changed");
-//            }
+    if (!hasCoulomb || !hasLJ) { // only scan the particles when one of the interaction types was left out
-//            else {
+        for (int i = 0; i < force.getNumParticles(); i++) {
-//                pmeZIndexKernel.setArg<mm_float4>(2, boxSize);
+            double charge, sigma, epsilon;
-//                pmeZIndexKernel.setArg<mm_float4>(3, invBoxSize);
+            force.getParticleParameters(i, charge, sigma, epsilon);
-//                cu.executeKernel(pmeZIndexKernel, cu.getNumAtoms());
+            if (!hasCoulomb && charge != 0.0)
-//                cu.executeKernel(pmeSpreadChargeKernel, cu.getNumAtoms());
+                throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Coulomb interactions, because all charges were originally 0");
-//            }
+            if (!hasLJ && epsilon != 0.0)
-//        }
+                throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Lennard-Jones interactions, because all epsilons were originally 0");
-//        fft->execFFT(*pmeGrid, *pmeGrid2, true);
+        }
-//        pmeConvolutionKernel.setArg<mm_float4>(5, invBoxSize);
+    }
-//        pmeConvolutionKernel.setArg<cl_float>(6, (float) (1.0/(M_PI*boxSize.x*boxSize.y*boxSize.z)));
+    vector<int> exceptions; // indices into force of the exceptions that match exceptionAtoms, in order
-//        cu.executeKernel(pmeConvolutionKernel, cu.getNumAtoms());
+    for (int i = 0; i < force.getNumExceptions(); i++) {
-//        fft->execFFT(*pmeGrid2, *pmeGrid, false);
+        int particle1, particle2;
-//        pmeInterpolateForceKernel.setArg<mm_float4>(3, boxSize);
+        double chargeProd, sigma, epsilon;
-//        pmeInterpolateForceKernel.setArg<mm_float4>(4, invBoxSize);
+        force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
-//        cu.executeKernel(pmeInterpolateForceKernel, cu.getNumAtoms(), interpolateForceThreads);
+        if (exceptionAtoms.size() > exceptions.size() && make_pair(particle1, particle2) == exceptionAtoms[exceptions.size()]) // this is the next pair recorded in exceptionAtoms
-//    }
+            exceptions.push_back(i);
-//    double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
+        else if (chargeProd != 0.0 || epsilon != 0.0) // an unmatched exception is accepted only if both its charge product and epsilon are zero
-//    if (dispersionCoefficient != 0.0 && includeDirect) {
+            throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
-//        mm_float4 boxSize = cu.getPeriodicBoxSize();
+    }
-//        energy += dispersionCoefficient/(boxSize.x*boxSize.y*boxSize.z);
+    int numContexts = cu.getPlatformData().contexts.size(); // the matched exceptions are divided among the contexts; this one handles [startIndex, endIndex)
-//    }
+    int startIndex = cu.getContextIndex()*exceptions.size()/numContexts;
-//    return energy;
+    int endIndex = (cu.getContextIndex()+1)*exceptions.size()/numContexts;
-//}
+    int numExceptions = endIndex-startIndex;
-//
-//void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const NonbondedForce& force) {
+    // Record the per-particle parameters: the charge is written to the w component of posq, and sigma and epsilon are written to sigmaEpsilon.
-//    // Make sure the new parameters are acceptable.
-//    
+    CudaArray& posq = cu.getPosq();
-//    cuCtxSetCurrent(cu.getContext());
+    posq.download(cu.getPinnedBuffer()); // fetch the current contents so that only the w component is changed before uploading again
-//    if (force.getNumParticles() != cu.getNumAtoms())
+    float4* posqf = (float4*) cu.getPinnedBuffer(); // single precision view of the downloaded buffer
-//        throw OpenMMException("updateParametersInContext: The number of particles has changed");
+    double4* posqd = (double4*) cu.getPinnedBuffer(); // double precision view of the same buffer; cu.getUseDoublePrecision() selects which view is written
-//    if (!hasCoulomb || !hasLJ) {
+    vector<float2> sigmaEpsilonVector(force.getNumParticles());
-//        for (int i = 0; i < force.getNumParticles(); i++) {
+    double sumSquaredCharges = 0.0; // accumulated for the Ewald self energy computed below
-//            double charge, sigma, epsilon;
+    const vector<int>& order = cu.getAtomIndex();
-//            force.getParticleParameters(i, charge, sigma, epsilon);
+    for (int i = 0; i < force.getNumParticles(); i++) {
-//            if (!hasCoulomb && charge != 0.0)
+        int index = order[i]; // entry i of posq receives the parameters of particle order[i] of the force
-//                throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Coulomb interactions, because all charges were originally 0");
+        double charge, sigma, epsilon;
-//            if (!hasLJ && epsilon != 0.0)
+        force.getParticleParameters(index, charge, sigma, epsilon);
-//                throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Lennard-Jones interactions, because all epsilons were originally 0");
+        if (cu.getUseDoublePrecision())
-//        }
+            posqd[i].w = charge;
-//    }
+        else
-//    vector<int> exceptions;
+            posqf[i].w = (float) charge;
-//    for (int i = 0; i < force.getNumExceptions(); i++) {
+        sigmaEpsilonVector[index] = make_float2((float) (0.5*sigma), (float) (2.0*sqrt(epsilon))); // stored as sigma/2 and 2*sqrt(epsilon), indexed by the force's particle index rather than by i
-//        int particle1, particle2;
+        sumSquaredCharges += charge*charge;
-//        double chargeProd, sigma, epsilon;
+    }
-//        force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
+    posq.upload(cu.getPinnedBuffer());
-//        if (exceptionAtoms.size() > exceptions.size() && make_pair(particle1, particle2) == exceptionAtoms[exceptions.size()])
+    sigmaEpsilon->upload(sigmaEpsilonVector); // NOTE(review): uploaded unconditionally with getNumParticles() entries; confirm sigmaEpsilon always exists and has that size
-//            exceptions.push_back(i);
-//        else if (chargeProd != 0.0 || epsilon != 0.0)
+    // Record the exceptions handled by this context.
-//            throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
-//    }
+    if (numExceptions > 0) {
-//    int numContexts = cu.getPlatformData().contexts.size();
+        vector<vector<int> > atoms(numExceptions, vector<int>(2)); // receives the particle indices, which are not used further in this function
-//    int startIndex = cu.getContextIndex()*exceptions.size()/numContexts;
+        vector<float4> exceptionParamsVector(numExceptions);
-//    int endIndex = (cu.getContextIndex()+1)*exceptions.size()/numContexts;
+        for (int i = 0; i < numExceptions; i++) {
-//    int numExceptions = endIndex-startIndex;
+            double chargeProd, sigma, epsilon;
-//    
+            force.getExceptionParameters(exceptions[startIndex+i], atoms[i][0], atoms[i][1], chargeProd, sigma, epsilon);
-//    // Record the per-particle parameters.
+            exceptionParamsVector[i] = make_float4((float) (ONE_4PI_EPS0*chargeProd), (float) sigma, (float) (4.0*epsilon), 0.0f); // stored as ONE_4PI_EPS0*chargeProd, sigma, 4*epsilon, and an unused zero
-//    
+        }
-//    CudaArray<mm_float4>& posq = cu.getPosq();
+        exceptionParams->upload(exceptionParamsVector);
-//    posq.download();
+    }
-//    vector<mm_float2> sigmaEpsilonVector(force.getNumParticles());
-//    double sumSquaredCharges = 0.0;
+    // Compute other values: the Ewald self energy and the dispersion correction coefficient. Only context 0 computes them; other contexts get a zero self energy.
-//    CudaArray<cl_int>& order = cu.getAtomIndex();
-//    for (int i = 0; i < force.getNumParticles(); i++) {
+    NonbondedForce::NonbondedMethod method = force.getNonbondedMethod();
-//        int index = order[i];
+    if (method == NonbondedForce::Ewald || method == NonbondedForce::PME)
-//        double charge, sigma, epsilon;
+        ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
-//        force.getParticleParameters(index, charge, sigma, epsilon);
+    if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && (method == NonbondedForce::CutoffPeriodic || method == NonbondedForce::Ewald || method == NonbondedForce::PME))
-//        posq[i].w = (float) charge;
+        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(context.getSystem(), force);
-//        sigmaEpsilonVector[index] = mm_float2((float) (0.5*sigma), (float) (2.0*sqrt(epsilon)));
+    cu.invalidateMolecules(); // presumably makes the context rebuild its molecule data now that parameters have changed; TODO confirm
-//        sumSquaredCharges += charge*charge;
+}
-//    }
-//    posq.upload();
-//    sigmaEpsilon->upload(sigmaEpsilonVector);
-//    
-//    // Record the exceptions.
-//    
-//    if (numExceptions > 0) {
-//        vector<vector<int> > atoms(numExceptions, vector<int>(2));
-//        vector<mm_float4> exceptionParamsVector(numExceptions);
-//        for (int i = 0; i < numExceptions; i++) {
-//            double chargeProd, sigma, epsilon;
-//            force.getExceptionParameters(exceptions[startIndex+i], atoms[i][0], atoms[i][1], chargeProd, sigma, epsilon);
-//            exceptionParamsVector[i] = mm_float4((float) (ONE_4PI_EPS0*chargeProd), (float) sigma, (float) (4.0*epsilon), 0.0f);
-//        }
-//        exceptionParams->upload(exceptionParamsVector);
-//    }
-//    
-//    // Compute other values.
-//    
-//    NonbondedForce::NonbondedMethod method = force.getNonbondedMethod();
-//    if (method == NonbondedForce::Ewald || method == NonbondedForce::PME)
-//        ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
-//    if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && (method == NonbondedForce::CutoffPeriodic || method == NonbondedForce::Ewald || method == NonbondedForce::PME))
-//        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(context.getSystem(), force);
-//    cu.invalidateMolecules();
-//}
-//
 //class CudaCustomNonbondedForceInfo : public CudaForceInfo {
 //public:
 //    CudaCustomNonbondedForceInfo(int requiredBuffers, const CustomNonbondedForce& force) : CudaForceInfo(requiredBuffers), force(force) {
@@ -1716,6 +1702,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
 //};
 //
 //CudaCalcCustomNonbondedForceKernel::~CudaCalcCustomNonbondedForceKernel() {
+//    cuCtxSetCurrent(cu.getContext());
 //    if (params != NULL)
 //        delete params;
 //    if (globals != NULL)
@@ -1727,6 +1714,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
 //}
 //
 //void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const CustomNonbondedForce& force) {
+//    cuCtxSetCurrent(cu.getContext());
 //    int forceIndex;
 //    for (forceIndex = 0; forceIndex < system.getNumForces() && &system.getForce(forceIndex) != &force; ++forceIndex)
 //        ;
@@ -1887,6 +1875,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
 //};
 //
 //CudaCalcGBSAOBCForceKernel::~CudaCalcGBSAOBCForceKernel() {
+//    cuCtxSetCurrent(cu.getContext());
 //    if (params != NULL)
 //        delete params;
 //    if (bornSum != NULL)
@@ -1904,6 +1893,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
 //}
 //
 //void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCForce& force) {
+//    cuCtxSetCurrent(cu.getContext());
 //    if (cu.getPlatformData().contexts.size() > 1)
 //        throw OpenMMException("GBSAOBCForce does not support using multiple CUDA devices");
 //    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
@@ -1977,10 +1967,10 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
 //            file = CudaKernelSources::gbsaObc_nvidia;
 //        else
 //            file = CudaKernelSources::gbsaObc_default;
-//        cu::Program program = cu.createProgram(file, defines);
+//        CUmodule module = cu.createModule(file, defines);
 //        bool useLong = (cu.getSupports64BitGlobalAtomics() && !deviceIsCpu);
 //        int index = 0;
-//        computeBornSumKernel = cu::Kernel(program, "computeBornSum");
+//        computeBornSumKernel = cu.getKernel(module, "computeBornSum");
 //        computeBornSumKernel.setArg<cu::Buffer>(index++, (useLong ? longBornSum->getDevicePointer() : bornSum->getDevicePointer()));
 //        computeBornSumKernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
 //        computeBornSumKernel.setArg<cu::Buffer>(index++, params->getDevicePointer());
@@ -1998,7 +1988,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
 //            computeBornSumKernel.setArg<cu::Buffer>(index++, nb.getExclusionIndices().getDevicePointer());
 //            computeBornSumKernel.setArg<cu::Buffer>(index++, nb.getExclusionRowIndices().getDevicePointer());
 //        }
-//        force1Kernel = cu::Kernel(program, "computeGBSAForce1");
+//        force1Kernel = cu.getKernel(module, "computeGBSAForce1");
 //        index = 0;
 //        force1Kernel.setArg<cu::Buffer>(index++, (useLong ? cu.getLongForceBuffer().getDevicePointer() : cu.getForceBuffers().getDevicePointer()));
 //        force1Kernel.setArg<cu::Buffer>(index++, (useLong ? longBornForce->getDevicePointer() : bornForce->getDevicePointer()));
@@ -2019,8 +2009,8 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
 //            force1Kernel.setArg<cu::Buffer>(index++, nb.getExclusionIndices().getDevicePointer());
 //            force1Kernel.setArg<cu::Buffer>(index++, nb.getExclusionRowIndices().getDevicePointer());
 //        }
-//        program = cu.createProgram(CudaKernelSources::gbsaObcReductions, defines);
+//        module = cu.createModule(CudaKernelSources::gbsaObcReductions, defines);
-//        reduceBornSumKernel = cu::Kernel(program, "reduceBornSum");
+//        reduceBornSumKernel = cu.getKernel(module, "reduceBornSum");
 //        reduceBornSumKernel.setArg<cl_int>(0, cu.getPaddedNumAtoms());
 //        reduceBornSumKernel.setArg<cl_int>(1, nb.getNumForceBuffers());
 //        reduceBornSumKernel.setArg<cl_float>(2, 1.0f);
@@ -2030,7 +2020,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
 //        reduceBornSumKernel.setArg<cu::Buffer>(6, params->getDevicePointer());
 //        reduceBornSumKernel.setArg<cu::Buffer>(7, bornRadii->getDevicePointer());
 //        reduceBornSumKernel.setArg<cu::Buffer>(8, obcChain->getDevicePointer());
-//        reduceBornForceKernel = cu::Kernel(program, "reduceBornForce");
+//        reduceBornForceKernel = cu.getKernel(module, "reduceBornForce");
 //        index = 0;
 //        reduceBornForceKernel.setArg<cl_int>(index++, cu.getPaddedNumAtoms());
 //        reduceBornForceKernel.setArg<cl_int>(index++, nb.getNumForceBuffers());
@@ -2127,6 +2117,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
 //};
 //
 //CudaCalcCustomGBForceKernel::~CudaCalcCustomGBForceKernel() {
+//    cuCtxSetCurrent(cu.getContext());
 //    if (params != NULL)
 //        delete params;
 //    if (computedValues != NULL)
@@ -2148,6 +2139,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
 //}
 //
 //void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomGBForce& force) {
+//    cuCtxSetCurrent(cu.getContext());
 //    if (cu.getPlatformData().contexts.size() > 1)
 //        throw OpenMMException("CustomGBForce does not support using multiple CUDA devices");
 //    bool useExclusionsForValue = false;
@@ -2360,8 +2352,8 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
 //            file = CudaKernelSources::customGBValueN2_nvidia;
 //        else
 //            file = CudaKernelSources::customGBValueN2_default;
-//        cu::Program program = cu.createProgram(cu.replaceStrings(file, replacements), defines);
+//        CUmodule module = cu.createModule(cu.replaceStrings(file, replacements), defines);
-//        pairValueKernel = cu::Kernel(program, "computeN2Value");
+//        pairValueKernel = cu.getKernel(module, "computeN2Value");
 //        if (useExclusionsForValue)
 //            cu.getNonbondedUtilities().requestExclusions(exclusionList);
 //    }
@@ -2406,8 +2398,8 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
 //        replacements["COMPUTE_VALUES"] = reductionSource.str();
 //        map<string, string> defines;
 //        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-//        cu::Program program = cu.createProgram(cu.replaceStrings(CudaKernelSources::customGBValuePerParticle, replacements), defines);
+//        CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customGBValuePerParticle, replacements), defines);
-//        perParticleValueKernel = cu::Kernel(program, "computePerParticleValues");
+//        perParticleValueKernel = cu.getKernel(module, "computePerParticleValues");
 //    }
 //    {
 //        // Create the N2 energy kernel.
@@ -2559,8 +2551,8 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
 //            file = CudaKernelSources::customGBEnergyN2_nvidia;
 //        else
 //            file = CudaKernelSources::customGBEnergyN2_default;
-//        cu::Program program = cu.createProgram(cu.replaceStrings(file, replacements), defines);
+//        CUmodule module = cu.createModule(cu.replaceStrings(file, replacements), defines);
-//        pairEnergyKernel = cu::Kernel(program, "computeN2Energy");
+//        pairEnergyKernel = cu.getKernel(module, "computeN2Energy");
 //    }
 //    {
 //        // Create the kernel to reduce the derivatives and calculate per-particle energy terms.
@@ -2654,8 +2646,8 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
 //        map<string, string> defines;
 //        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
 //        defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-//        cu::Program program = cu.createProgram(cu.replaceStrings(CudaKernelSources::customGBEnergyPerParticle, replacements), defines);
+//        CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customGBEnergyPerParticle, replacements), defines);
-//        perParticleEnergyKernel = cu::Kernel(program, "computePerParticleEnergy");
+//        perParticleEnergyKernel = cu.getKernel(module, "computePerParticleEnergy");
 //    }
 //    if (needParameterGradient) {
 //        // Create the kernel to compute chain rule terms for computed values that depend explicitly on particle coordinates.
@@ -2719,8 +2711,8 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
 //        replacements["COMPUTE_FORCES"] = compute.str();
 //        map<string, string> defines;
 //        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-//        cu::Program program = cu.createProgram(cu.replaceStrings(CudaKernelSources::customGBGradientChainRule, replacements), defines);
+//        CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customGBGradientChainRule, replacements), defines);
-//        gradientChainRuleKernel = cu::Kernel(program, "computeGradientChainRuleTerms");
+//        gradientChainRuleKernel = cu.getKernel(module, "computeGradientChainRuleTerms");
 //    }
 //    {
 //        // Create the code to calculate chain rules terms as part of the default nonbonded kernel.
@@ -3061,6 +3053,7 @@ private:
 };
 CudaCalcCustomExternalForceKernel::~CudaCalcCustomExternalForceKernel() {
+    cuCtxSetCurrent(cu.getContext());
    if (params != NULL)
        delete params;
    if (globals != NULL)
@@ -3068,6 +3061,7 @@ CudaCalcCustomExternalForceKernel::~CudaCalcCustomExternalForceKernel() {
 }
 void CudaCalcCustomExternalForceKernel::initialize(const System& system, const CustomExternalForce& force) {
+    cuCtxSetCurrent(cu.getContext());
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumParticles()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumParticles()/numContexts;
@@ -3127,7 +3121,7 @@ void CudaCalcCustomExternalForceKernel::initialize(const System& system, const C
    }
    stringstream compute;
    for (int i = 0; i < (int) params->getBuffers().size(); i++) {
-        const CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
+        CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
        string argName = cu.getBondedUtilities().addArgument(buffer.getMemory(), buffer.getType());
        compute<<buffer.getType()<<" particleParams"<<(i+1)<<" = "<<argName<<"[index];\n";
    }
@@ -3256,6 +3250,7 @@ void CudaCalcCustomExternalForceKernel::copyParametersToContext(ContextImpl& con
 //};
 //
 //CudaCalcCustomHbondForceKernel::~CudaCalcCustomHbondForceKernel() {
+//    cuCtxSetCurrent(cu.getContext());
 //    if (donorParams != NULL)
 //        delete donorParams;
 //    if (acceptorParams != NULL)
@@ -3296,6 +3291,7 @@ void CudaCalcCustomExternalForceKernel::copyParametersToContext(ContextImpl& con
 //void CudaCalcCustomHbondForceKernel::initialize(const System& system, const CustomHbondForce& force) {
 //    // Record the lists of donors and acceptors, and the parameters for each one.
 //
+//    cuCtxSetCurrent(cu.getContext());
 //    int numContexts = cu.getPlatformData().contexts.size();
 //    int startIndex = cu.getContextIndex()*force.getNumDonors()/numContexts;
 //    int endIndex = (cu.getContextIndex()+1)*force.getNumDonors()/numContexts;
@@ -3608,9 +3604,9 @@ void CudaCalcCustomExternalForceKernel::copyParametersToContext(ContextImpl& con
 //        defines["USE_PERIODIC"] = "1";
 //    if (force.getNumExclusions() > 0)
 //        defines["USE_EXCLUSIONS"] = "1";
-//    cu::Program program = cu.createProgram(cu.replaceStrings(CudaKernelSources::customHbondForce, replacements), defines);
+//    CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customHbondForce, replacements), defines);
-//    donorKernel = cu::Kernel(program, "computeDonorForces");
+//    donorKernel = cu.getKernel(module, "computeDonorForces");
-//    acceptorKernel = cu::Kernel(program, "computeAcceptorForces");
+//    acceptorKernel = cu.getKernel(module, "computeAcceptorForces");
 //}
 //
 //double CudaCalcCustomHbondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
@@ -3755,6 +3751,7 @@ private:
 };
 CudaCalcCustomCompoundBondForceKernel::~CudaCalcCustomCompoundBondForceKernel() {
+    cuCtxSetCurrent(cu.getContext());
    if (params != NULL)
        delete params;
    if (globals != NULL)
@@ -3766,6 +3763,7 @@ CudaCalcCustomCompoundBondForceKernel::~CudaCalcCustomCompoundBondForceKernel()
 }
 void CudaCalcCustomCompoundBondForceKernel::initialize(const System& system, const CustomCompoundBondForce& force) {
+    cuCtxSetCurrent(cu.getContext());
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumBonds()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumBonds()/numContexts;
@@ -3922,7 +3920,7 @@ void CudaCalcCustomCompoundBondForceKernel::initialize(const System& system, con
    // Now evaluate the expressions.
    for (int i = 0; i < (int) params->getBuffers().size(); i++) {
-        const CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
+        CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
        string argName = cu.getBondedUtilities().addArgument(buffer.getMemory(), buffer.getType());
        compute<<buffer.getType()<<" bondParams"<<(i+1)<<" = "<<argName<<"[index];\n";
    }
@@ -4051,6 +4049,7 @@ CudaIntegrateVerletStepKernel::~CudaIntegrateVerletStepKernel() {
 }
 void CudaIntegrateVerletStepKernel::initialize(const System& system, const VerletIntegrator& integrator) {
+    cuCtxSetCurrent(cu.getContext());
    cu.getPlatformData().initializeContexts(system);
    map<string, string> defines;
    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
@@ -4103,19 +4102,21 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
 }
 //CudaIntegrateLangevinStepKernel::~CudaIntegrateLangevinStepKernel() {
+//    cuCtxSetCurrent(cu.getContext());
 //    if (params != NULL)
 //        delete params;
 //}
 //
 //void CudaIntegrateLangevinStepKernel::initialize(const System& system, const LangevinIntegrator& integrator) {
+//    cuCtxSetCurrent(cu.getContext());
 //    cu.getPlatformData().initializeContexts(system);
 //    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
 //    map<string, string> defines;
 //    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
 //    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-//    cu::Program program = cu.createProgram(CudaKernelSources::langevin, defines, "");
+//    CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, "");
-//    kernel1 = cu::Kernel(program, "integrateLangevinPart1");
+//    kernel1 = cu.getKernel(module, "integrateLangevinPart1");
-//    kernel2 = cu::Kernel(program, "integrateLangevinPart2");
+//    kernel2 = cu.getKernel(module, "integrateLangevinPart2");
 //    params = new CudaArray<cl_float>(cu, 3, "langevinParams");
 //    prevStepSize = -1.0;
 //}
@@ -4183,13 +4184,14 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
 //}
 //
 //void CudaIntegrateBrownianStepKernel::initialize(const System& system, const BrownianIntegrator& integrator) {
+//    cuCtxSetCurrent(cu.getContext());
 //    cu.getPlatformData().initializeContexts(system);
 //    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
 //    map<string, string> defines;
 //    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-//    cu::Program program = cu.createProgram(CudaKernelSources::brownian, defines, "");
+//    CUmodule module = cu.createModule(CudaKernelSources::brownian, defines, "");
-//    kernel1 = cu::Kernel(program, "integrateBrownianPart1");
+//    kernel1 = cu.getKernel(module, "integrateBrownianPart1");
-//    kernel2 = cu::Kernel(program, "integrateBrownianPart2");
+//    kernel2 = cu.getKernel(module, "integrateBrownianPart2");
 //    prevStepSize = -1.0;
 //}
 //
@@ -4243,11 +4245,12 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
 //}
 //
 //void CudaIntegrateVariableVerletStepKernel::initialize(const System& system, const VariableVerletIntegrator& integrator) {
+//    cuCtxSetCurrent(cu.getContext());
 //    cu.getPlatformData().initializeContexts(system);
-//    cu::Program program = cu.createProgram(CudaKernelSources::verlet, "");
+//    CUmodule module = cu.createModule(CudaKernelSources::verlet, "");
-//    kernel1 = cu::Kernel(program, "integrateVerletPart1");
+//    kernel1 = cu.getKernel(module, "integrateVerletPart1");
-//    kernel2 = cu::Kernel(program, "integrateVerletPart2");
+//    kernel2 = cu.getKernel(module, "integrateVerletPart2");
-//    selectSizeKernel = cu::Kernel(program, "selectVerletStepSize");
+//    selectSizeKernel = cu.getKernel(module, "selectVerletStepSize");
 //    blockSize = min(min(256, system.getNumParticles()), (int) cu.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
 //}
 //
@@ -4307,20 +4310,22 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
 //}
 //
 //CudaIntegrateVariableLangevinStepKernel::~CudaIntegrateVariableLangevinStepKernel() {
+//    cuCtxSetCurrent(cu.getContext());
 //    if (params != NULL)
 //        delete params;
 //}
 //
 //void CudaIntegrateVariableLangevinStepKernel::initialize(const System& system, const VariableLangevinIntegrator& integrator) {
+//    cuCtxSetCurrent(cu.getContext());
 //    cu.getPlatformData().initializeContexts(system);
 //    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
 //    map<string, string> defines;
 //    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
 //    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-//    cu::Program program = cu.createProgram(CudaKernelSources::langevin, defines, "");
+//    CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, "");
-//    kernel1 = cu::Kernel(program, "integrateLangevinPart1");
+//    kernel1 = cu.getKernel(module, "integrateLangevinPart1");
-//    kernel2 = cu::Kernel(program, "integrateLangevinPart2");
+//    kernel2 = cu.getKernel(module, "integrateLangevinPart2");
-//    selectSizeKernel = cu::Kernel(program, "selectLangevinStepSize");
+//    selectSizeKernel = cu.getKernel(module, "selectLangevinStepSize");
 //    params = new CudaArray<cl_float>(cu, 3, "langevinParams");
 //    blockSize = min(256, system.getNumParticles());
 //    blockSize = max(blockSize, params->getSize());
@@ -4428,6 +4433,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
 //};
 //
 //CudaIntegrateCustomStepKernel::~CudaIntegrateCustomStepKernel() {
+//    cuCtxSetCurrent(cu.getContext());
 //    if (globalValues != NULL)
 //        delete globalValues;
 //    if (contextParameterValues != NULL)
@@ -4445,6 +4451,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
 //}
 //
 //void CudaIntegrateCustomStepKernel::initialize(const System& system, const CustomIntegrator& integrator) {
+//    cuCtxSetCurrent(cu.getContext());
 //    cu.getPlatformData().initializeContexts(system);
 //    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
 //    numGlobalVariables = integrator.getNumGlobalVariables();
@@ -4565,8 +4572,8 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
 //            seed[i].w = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
 //        }
 //        randomSeed->upload(seed);
-//        cu::Program randomProgram = cu.createProgram(CudaKernelSources::customIntegrator, defines);
+//        CUmodule randomProgram = cu.createModule(CudaKernelSources::customIntegrator, defines);
-//        randomKernel = cu::Kernel(randomProgram, "generateRandomNumbers");
+//        randomKernel = cu.getKernel(randomProgram, "generateRandomNumbers");
 //        randomKernel.setArg<cu::Buffer>(0, uniformRandoms->getDevicePointer());
 //        randomKernel.setArg<cu::Buffer>(1, randomSeed->getDevicePointer());
 //        
@@ -4721,8 +4728,8 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
 //                    defines["LOAD_POS_AS_DELTA"] = "1";
 //                else if (defines.find("LOAD_POS_AS_DELTA") != defines.end())
 //                    defines.erase("LOAD_POS_AS_DELTA");
-//                cu::Program program = cu.createProgram(cu.replaceStrings(CudaKernelSources::customIntegratorPerDof, replacements), defines);
+//                CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customIntegratorPerDof, replacements), defines);
-//                cu::Kernel kernel = cu::Kernel(program, "computePerDof");
+//                cu::Kernel kernel = cu.getKernel(module, "computePerDof");
 //                kernels[step].push_back(kernel);
 //                requiredGaussian[step] = numGaussian;
 //                requiredUniform[step] = numUniform;
@@ -4744,8 +4751,8 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
 //                if (stepType[step] == CustomIntegrator::ComputeSum) {
 //                    // Create a second kernel for this step that sums the values.
 //
-//                    program = cu.createProgram(CudaKernelSources::customIntegrator, defines);
+//                    module = cu.createModule(CudaKernelSources::customIntegrator, defines);
-//                    kernel = cu::Kernel(program, "computeSum");
+//                    kernel = cu.getKernel(module, "computeSum");
 //                    kernels[step].push_back(kernel);
 //                    index = 0;
 //                    kernel.setArg<cu::Buffer>(index++, sumBuffer->getDevicePointer());
@@ -4776,8 +4783,8 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
 //                    compute << "{\n" << createGlobalComputation(variable[i], expression[i], integrator, energyName[i]) << "}\n";
 //                map<string, string> replacements;
 //                replacements["COMPUTE_STEP"] = compute.str();
-//                cu::Program program = cu.createProgram(cu.replaceStrings(CudaKernelSources::customIntegratorGlobal, replacements), defines);
+//                CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customIntegratorGlobal, replacements), defines);
-//                cu::Kernel kernel = cu::Kernel(program, "computeGlobal");
+//                cu::Kernel kernel = cu.getKernel(module, "computeGlobal");
 //                kernels[step].push_back(kernel);
 //                int index = 0;
 //                kernel.setArg<cu::Buffer>(index++, integration.getStepSize().getDevicePointer());
@@ -4789,8 +4796,8 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
 //            else if (stepType[step] == CustomIntegrator::ConstrainPositions) {
 //                // Apply position constraints.
 //
-//                cu::Program program = cu.createProgram(CudaKernelSources::customIntegrator, defines);
+//                CUmodule module = cu.createModule(CudaKernelSources::customIntegrator, defines);
-//                cu::Kernel kernel = cu::Kernel(program, "applyPositionDeltas");
+//                cu::Kernel kernel = cu.getKernel(module, "applyPositionDeltas");
 //                kernels[step].push_back(kernel);
 //                int index = 0;
 //                kernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
@@ -4800,8 +4807,8 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
 //        
 //        // Create the kernel for summing energy.
 //
-//        cu::Program program = cu.createProgram(CudaKernelSources::customIntegrator, defines);
+//        CUmodule module = cu.createModule(CudaKernelSources::customIntegrator, defines);
-//        sumEnergyKernel = cu::Kernel(program, "computeSum");
+//        sumEnergyKernel = cu.getKernel(module, "computeSum");
 //        int index = 0;
 //        sumEnergyKernel.setArg<cu::Buffer>(index++, cu.getEnergyBuffer().getDevicePointer());
 //        sumEnergyKernel.setArg<cu::Buffer>(index++, energy->getDevicePointer());
@@ -4949,16 +4956,18 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
 //}
 //
 //CudaApplyAndersenThermostatKernel::~CudaApplyAndersenThermostatKernel() {
+//    cuCtxSetCurrent(cu.getContext());
 //    if (atomGroups != NULL)
 //        delete atomGroups;
 //}
 //
 //void CudaApplyAndersenThermostatKernel::initialize(const System& system, const AndersenThermostat& thermostat) {
+//    cuCtxSetCurrent(cu.getContext());
 //    randomSeed = thermostat.getRandomNumberSeed();
 //    map<string, string> defines;
 //    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-//    cu::Program program = cu.createProgram(CudaKernelSources::andersenThermostat, defines);
+//    CUmodule module = cu.createModule(CudaKernelSources::andersenThermostat, defines);
-//    kernel = cu::Kernel(program, "applyAndersenThermostat");
+//    kernel = cu.getKernel(module, "applyAndersenThermostat");
 //    cu.getIntegrationUtilities().initRandomNumberGenerator(randomSeed);
 //
 //    // Create the arrays with the group definitions.
@@ -4988,6 +4997,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
 //}
 //
 //CudaApplyMonteCarloBarostatKernel::~CudaApplyMonteCarloBarostatKernel() {
+//    cuCtxSetCurrent(cu.getContext());
 //    if (savedPositions != NULL)
 //        delete savedPositions;
 //    if (moleculeAtoms != NULL)
@@ -4997,9 +5007,10 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
 //}
 //
 //void CudaApplyMonteCarloBarostatKernel::initialize(const System& system, const MonteCarloBarostat& thermostat) {
+//    cuCtxSetCurrent(cu.getContext());
 //    savedPositions = new CudaArray<mm_float4>(cu, cu.getPaddedNumAtoms(), "savedPositions");
-//    cu::Program program = cu.createProgram(CudaKernelSources::monteCarloBarostat);
+//    CUmodule module = cu.createModule(CudaKernelSources::monteCarloBarostat);
-//    kernel = cu::Kernel(program, "scalePositions");
+//    kernel = cu.getKernel(module, "scalePositions");
 //}
 //
 //void CudaApplyMonteCarloBarostatKernel::scaleCoordinates(ContextImpl& context, double scale) {
@@ -5045,6 +5056,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
 //}
 void CudaCalcKineticEnergyKernel::initialize(const System& system) {
+    cuCtxSetCurrent(cu.getContext());
    int numParticles = system.getNumParticles();
    masses.resize(numParticles);
    for (int i = 0; i < numParticles; ++i)
@@ -5077,11 +5089,13 @@ double CudaCalcKineticEnergyKernel::execute(ContextImpl& context) {
 }
 //CudaRemoveCMMotionKernel::~CudaRemoveCMMotionKernel() {
+//    cuCtxSetCurrent(cu.getContext());
 //    if (cmMomentum != NULL)
 //        delete cmMomentum;
 //}
 //
 //void CudaRemoveCMMotionKernel::initialize(const System& system, const CMMotionRemover& force) {
+//    cuCtxSetCurrent(cu.getContext());
 //    frequency = force.getFrequency();
 //    int numAtoms = cu.getNumAtoms();
 //    cmMomentum = new CudaArray<mm_float4>(cu, (numAtoms+CudaContext::ThreadBlockSize-1)/CudaContext::ThreadBlockSize, "cmMomentum");
@@ -5090,13 +5104,13 @@ double CudaCalcKineticEnergyKernel::execute(ContextImpl& context) {
 //        totalMass += system.getParticleMass(i);
 //    map<string, string> defines;
 //    defines["INVERSE_TOTAL_MASS"] = cu.doubleToString(1.0/totalMass);
-//    cu::Program program = cu.createProgram(CudaKernelSources::removeCM, defines);
+//    CUmodule module = cu.createModule(CudaKernelSources::removeCM, defines);
-//    kernel1 = cu::Kernel(program, "calcCenterOfMassMomentum");
+//    kernel1 = cu.getKernel(module, "calcCenterOfMassMomentum");
 //    kernel1.setArg<cl_int>(0, numAtoms);
 //    kernel1.setArg<cu::Buffer>(1, cu.getVelm().getDevicePointer());
 //    kernel1.setArg<cu::Buffer>(2, cmMomentum->getDevicePointer());
 //    kernel1.setArg(3, CudaContext::ThreadBlockSize*sizeof(mm_float4), NULL);
-//    kernel2 = cu::Kernel(program, "removeCenterOfMassMomentum");
+//    kernel2 = cu.getKernel(module, "removeCenterOfMassMomentum");
 //    kernel2.setArg<cl_int>(0, numAtoms);
 //    kernel2.setArg<cu::Buffer>(1, cu.getVelm().getDevicePointer());
 //    kernel2.setArg<cu::Buffer>(2, cmMomentum->getDevicePointer());

--- a/platforms/cuda2/src/CudaKernels.h
+++ b/platforms/cuda2/src/CudaKernels.h
@@ -34,6 +34,7 @@
 #include "CudaSort.h"
 #include "openmm/kernels.h"
 #include "openmm/System.h"
+#include <cufft.h>
 namespace OpenMM {
@@ -542,87 +543,86 @@ private:
    std::vector<float> globalParamValues;
 };
-///**
+/**
-// * This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
+ * This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
-// */
+ */
-//class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
+class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
-//public:
+public:
+    // The constructor only records the CudaContext, clears hasInitializedFFT and sets every
+    // array/sorter pointer to NULL; it does not touch the remaining members.
-//    CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcNonbondedForceKernel(name, platform),
+    CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcNonbondedForceKernel(name, platform),
-//            hasInitializedKernel(false), cu(cu), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL),
+            cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL),
-//            pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDTheta(NULL),
+            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDTheta(NULL),
-//            pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), fft(NULL) {
+            pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL) {
-//    }
+    }
-//    ~CudaCalcNonbondedForceKernel();
+    ~CudaCalcNonbondedForceKernel();
-//    /**
+    /**
-//     * Initialize the kernel.
+     * Initialize the kernel.
-//     *
+     *
-//     * @param system     the System this kernel will be applied to
+     * @param system     the System this kernel will be applied to
-//     * @param force      the NonbondedForce this kernel will be used for
+     * @param force      the NonbondedForce this kernel will be used for
-//     */
+     */
-//    void initialize(const System& system, const NonbondedForce& force);
+    void initialize(const System& system, const NonbondedForce& force);
-//    /**
+    /**
-//     * Execute the kernel to calculate the forces and/or energy.
+     * Execute the kernel to calculate the forces and/or energy.
-//     *
+     *
-//     * @param context        the context in which to execute this kernel
+     * @param context        the context in which to execute this kernel
-//     * @param includeForces  true if forces should be calculated
+     * @param includeForces  true if forces should be calculated
-//     * @param includeEnergy  true if the energy should be calculated
+     * @param includeEnergy  true if the energy should be calculated
-//     * @param includeDirect  true if direct space interactions should be included
+     * @param includeDirect  true if direct space interactions should be included
-//     * @param includeReciprocal  true if reciprocal space interactions should be included
+     * @param includeReciprocal  true if reciprocal space interactions should be included
-//     * @return the potential energy due to the force
+     * @return the potential energy due to the force
-//     */
+     */
-//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal);
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal);
-//    /**
+    /**
-//     * Copy changed parameters over to a context.
+     * Copy changed parameters over to a context.
-//     *
+     *
-//     * @param context    the context to copy parameters to
+     * @param context    the context to copy parameters to
-//     * @param force      the NonbondedForce to copy the parameters from
+     * @param force      the NonbondedForce to copy the parameters from
-//     */
+     */
-//    void copyParametersToContext(ContextImpl& context, const NonbondedForce& force);
+    void copyParametersToContext(ContextImpl& context, const NonbondedForce& force);
-//private:
+private:
+    // Describes the elements given to CudaSort: 8-byte "int2" values ordered by the
+    // 4-byte "int" key taken from value.y.
+    // NOTE(review): the overrides are private because this is declared with `class`; they are
+    // presumably reached through the CudaSort::SortTrait base interface -- confirm.
-//    struct SortTrait {
+    class SortTrait : public CudaSort::SortTrait {
-//        typedef mm_int2 DataType;
+        int getDataSize() const {return 8;}
-//        typedef cl_int KeyType;
+        int getKeySize() const {return 4;}
-//        static const char* clDataType() {return "int2";}
+        const char* getDataType() const {return "int2";}
-//        static const char* clKeyType() {return "int";}
+        const char* getKeyType() const {return "int";}
-//        static const char* clMinKey() {return "INT_MIN";}
+        const char* getMinKey() const {return "INT_MIN";}
-//        static const char* clMaxKey() {return "INT_MAX";}
+        const char* getMaxKey() const {return "INT_MAX";}
-//        static const char* clMaxValue() {return "(int2) (INT_MAX, INT_MAX)";}
+        const char* getMaxValue() const {return "make_int2(INT_MAX, INT_MAX)";}
-//        static const char* clSortKey() {return "value.y";}
+        const char* getSortKey() const {return "value.y";}
-//    };
+    };
-//    CudaContext& cu;
+    CudaContext& cu;
+    // Starts out false (see constructor); presumably set once the cuFFT plan stored in `fft`
+    // has been created -- TODO confirm in CudaKernels.cpp.
-//    bool hasInitializedKernel;
+    bool hasInitializedFFT;
-//    CudaArray<mm_float2>* sigmaEpsilon;
+    CudaArray* sigmaEpsilon;
-//    CudaArray<mm_float4>* exceptionParams;
+    CudaArray* exceptionParams;
-//    CudaArray<mm_float2>* cosSinSums;
+    CudaArray* cosSinSums;
-//    CudaArray<mm_float2>* pmeGrid;
+    CudaArray* pmeGrid;
-//    CudaArray<mm_float2>* pmeGrid2;
+    CudaArray* pmeBsplineModuliX;
-//    CudaArray<cl_float>* pmeBsplineModuliX;
+    CudaArray* pmeBsplineModuliY;
-//    CudaArray<cl_float>* pmeBsplineModuliY;
+    CudaArray* pmeBsplineModuliZ;
-//    CudaArray<cl_float>* pmeBsplineModuliZ;
+    CudaArray* pmeBsplineTheta;
-//    CudaArray<mm_float4>* pmeBsplineTheta;
+    CudaArray* pmeBsplineDTheta;
-//    CudaArray<mm_float4>* pmeBsplineDTheta;
+    CudaArray* pmeAtomRange;
-//    CudaArray<cl_int>* pmeAtomRange;
+    CudaArray* pmeAtomGridIndex;
-//    CudaArray<mm_int2>* pmeAtomGridIndex;
+    CudaSort* sort;
+    // cuFFT handle; it takes the place of the CudaFFT3D pointer used by the commented-out
+    // version and, unlike the pointers above, is not set by the constructor.
-//    CudaSort<SortTrait>* sort;
+    cufftHandle fft;
-//    CudaFFT3D* fft;
+    CUfunction ewaldSumsKernel;
-//    CUfunction ewaldSumsKernel;
+    CUfunction ewaldForcesKernel;
-//    CUfunction ewaldForcesKernel;
+    CUfunction pmeGridIndexKernel;
-//    CUfunction pmeGridIndexKernel;
+    CUfunction pmeAtomRangeKernel;
-//    CUfunction pmeAtomRangeKernel;
+    CUfunction pmeZIndexKernel;
-//    CUfunction pmeZIndexKernel;
+    CUfunction pmeUpdateBsplinesKernel;
-//    CUfunction pmeUpdateBsplinesKernel;
+    CUfunction pmeSpreadChargeKernel;
-//    CUfunction pmeSpreadChargeKernel;
+    CUfunction pmeFinishSpreadChargeKernel;
-//    CUfunction pmeFinishSpreadChargeKernel;
+    CUfunction pmeConvolutionKernel;
-//    CUfunction pmeConvolutionKernel;
+    CUfunction pmeInterpolateForceKernel;
-//    CUfunction pmeInterpolateForceKernel;
+    std::map<std::string, std::string> pmeDefines;
-//    std::map<std::string, std::string> pmeDefines;
+    std::vector<std::pair<int, int> > exceptionAtoms;
-//    std::vector<std::pair<int, int> > exceptionAtoms;
+    double ewaldSelfEnergy, dispersionCoefficient, alpha;
-//    double ewaldSelfEnergy, dispersionCoefficient, alpha;
+    int interpolateForceThreads;
-//    int interpolateForceThreads;
+    bool hasCoulomb, hasLJ;
-//    bool hasCoulomb, hasLJ;
+    static const int PmeOrder = 5;
-//    static const int PmeOrder = 5;
+};
-//};
-//
 ///**
 // * This kernel is invoked by CustomNonbondedForce to calculate the forces acting on the system.
 // */

--- a/platforms/cuda2/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda2/src/CudaNonbondedUtilities.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "openmm/OpenMMException.h"
+#include "CudaNonbondedUtilities.h"
+#include "CudaArray.h"
+#include "CudaKernelSources.h"
+#include "CudaExpressionUtilities.h"
+#include <map>
+#include <set>
+#include <utility>
+using namespace OpenMM;
+using namespace std;
+// Evaluate a CUDA driver API call exactly once and throw an OpenMMException describing
+// the failure if it did not return CUDA_SUCCESS.  The expansion expects a string named
+// `errorMessage` and the CudaContext member `context` to be in scope at the point of use.
+// The argument is captured in a local so that a failing call is not executed again while
+// the message is built, and the do/while(0) wrapper makes the macro behave as a single
+// statement (safe inside an unbraced if/else).
+#define CHECK_RESULT(result) \
+    do { \
+        CUresult checkResultStatus_ = (result); \
+        if (checkResultStatus_ != CUDA_SUCCESS) { \
+            std::stringstream m; \
+            m<<errorMessage<<": "<<context.getErrorString(checkResultStatus_)<<" ("<<checkResultStatus_<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+            throw OpenMMException(m.str());\
+        } \
+    } while (0)
+/**
+ * Create the nonbonded utilities for a context.  No interaction is registered yet:
+ * cutoff is set to the sentinel -1.0, all device array pointers are NULL, and the
+ * force group defaults to 0.
+ *
+ * @param context    the CudaContext these utilities belong to
+ * @throws OpenMMException if the device's multiprocessor count cannot be queried
+ */
+CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), cutoff(-1.0), useCutoff(false), anyExclusions(false),
+        exclusionIndices(NULL), exclusionRowIndices(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL),
+        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), pinnedInteractionCount(NULL), nonbondedForceGroup(0) {
+    // usePeriodic is not covered by the initializer list; give it a defined value so it is
+    // never read uninitialized before the first call to addInteraction().
+    usePeriodic = false;
+    // Decide how many thread blocks to use.
+    string errorMessage = "Error initializing nonbonded utilities";
+    int multiprocessors;
+    CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, context.getDevice()));
+    // Two thread blocks per multiprocessor, 256 threads per block.
+    numForceThreadBlocks = 2*multiprocessors;
+    forceThreadBlockSize = 256;
+}
+/**
+ * Release every array owned by this object, plus the host buffer held in
+ * pinnedInteractionCount.
+ */
+CudaNonbondedUtilities::~CudaNonbondedUtilities() {
+    // delete on a null pointer does nothing, so arrays that were never created need no guard.
+    delete exclusionIndices;
+    delete exclusionRowIndices;
+    delete exclusions;
+    delete interactingTiles;
+    delete interactionFlags;
+    delete interactionCount;
+    delete blockCenter;
+    delete blockBoundingBox;
+    // This buffer is released through the driver API rather than delete, so it keeps its guard.
+    if (pinnedInteractionCount != NULL)
+        cuMemFreeHost(pinnedInteractionCount);
+}
+/**
+ * Register a nonbonded interaction.  Once a first interaction has been recorded (cutoff is
+ * no longer the sentinel -1.0), every later one must use the same cutoff flag, periodic
+ * flag, cutoff distance and force group; otherwise an OpenMMException is thrown.
+ *
+ * @param usesCutoff      whether the interaction uses a cutoff
+ * @param usesPeriodic    whether the interaction uses periodic boundary conditions
+ * @param usesExclusions  whether exclusionList should be registered via requestExclusions()
+ * @param cutoffDistance  the cutoff distance
+ * @param exclusionList   for each atom, the atoms excluded from interacting with it
+ * @param kernel          source code appended (followed by a newline) to the kernel source
+ * @param forceGroup      the force group of the interaction
+ */
+void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
+    // A cutoff other than -1.0 means an earlier interaction already fixed the settings.
+    const bool alreadyConfigured = (cutoff != -1.0);
+    if (alreadyConfigured && usesCutoff != useCutoff)
+        throw OpenMMException("All Forces must agree on whether to use a cutoff");
+    if (alreadyConfigured && usesPeriodic != usePeriodic)
+        throw OpenMMException("All Forces must agree on whether to use periodic boundary conditions");
+    if (alreadyConfigured && cutoffDistance != cutoff)
+        throw OpenMMException("All Forces must use the same cutoff distance");
+    if (alreadyConfigured && forceGroup != nonbondedForceGroup)
+        throw OpenMMException("All nonbonded forces must be in the same force group");
+
+    // Exclusions are validated before any state is modified.
+    if (usesExclusions)
+        requestExclusions(exclusionList);
+
+    // Record the settings and append this interaction's source.
+    useCutoff = usesCutoff;
+    usePeriodic = usesPeriodic;
+    cutoff = cutoffDistance;
+    nonbondedForceGroup = forceGroup;
+    kernelSource += kernel;
+    kernelSource += "\n";
+}
+/**
+ * Append a per-atom parameter to the list passed to the default interaction kernel.
+ */
+void CudaNonbondedUtilities::addParameter(const ParameterInfo& parameter) {
+    parameters.insert(parameters.end(), parameter);
+}
+/**
+ * Append an extra array (other than a per-atom parameter) to the list passed to the default interaction kernel.
+ */
+void CudaNonbondedUtilities::addArgument(const ParameterInfo& parameter) {
+    arguments.insert(arguments.end(), parameter);
+}
+/**
+ * Record the exclusion list an interaction depends on.  The first request is stored; every later request
+ * must match it exactly.
+ *
+ * @param exclusionList  for each atom, the list of other atoms whose interactions should be excluded
+ * @throws OpenMMException if exclusions were already requested and this list differs from them
+ */
+void CudaNonbondedUtilities::requestExclusions(const vector<vector<int> >& exclusionList) {
+    if (!anyExclusions) {
+        atomExclusions = exclusionList;
+        anyExclusions = true;
+        return;
+    }
+    // vector equality checks the sizes before comparing elements, so lists of different lengths are
+    // rejected without ever indexing past the end of the shorter one.
+    if (exclusionList != atomExclusions)
+        throw OpenMMException("All Forces must have identical exceptions");
+}
+/**
+ * Build everything needed to evaluate the registered interactions: the range of tiles handled by this
+ * context, the per-tile exclusion tables, the neighbor list arrays (only when a cutoff is used), and the
+ * kernels together with their argument lists.  Does nothing if no interaction has been added.
+ *
+ * @param system    the System being simulated (not referenced by this implementation)
+ */
+void CudaNonbondedUtilities::initialize(const System& system) {
+    if (cutoff == -1.0)
+        return; // There are no nonbonded interactions in the System.
+    string errorMessage = "Error initializing nonbonded utilities"; // NOTE(review): presumably read by the CHECK_RESULT macro -- confirm
+    if (atomExclusions.size() == 0) {
+        // No exclusions were specifically requested, so just mark every atom as not interacting with itself.
+        atomExclusions.resize(context.getNumAtoms());
+        for (int i = 0; i < (int) atomExclusions.size(); i++)
+            atomExclusions[i].push_back(i);
+    }
+    // Create the list of tiles.  The tiles (pairs of atom blocks, counting each unordered pair once) are
+    // divided evenly among the contexts listed in the platform data.
+    numAtoms = context.getNumAtoms();
+    int numAtomBlocks = context.getNumAtomBlocks();
+    int totalTiles = numAtomBlocks*(numAtomBlocks+1)/2;
+    int numContexts = context.getPlatformData().contexts.size();
+    startTileIndex = context.getContextIndex()*totalTiles/numContexts;
+    int endTileIndex = (context.getContextIndex()+1)*totalTiles/numContexts;
+    numTiles = endTileIndex-startTileIndex;
+    // Build a list of indices for the tiles with exclusions.  Each tile is stored as (larger block, smaller block),
+    // so the set iterates row by row.
+    set<pair<int, int> > tilesWithExclusions;
+    for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
+        int x = atom1/CudaContext::TileSize;
+        for (int j = 0; j < (int) atomExclusions[atom1].size(); ++j) {
+            int atom2 = atomExclusions[atom1][j];
+            int y = atom2/CudaContext::TileSize;
+            tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
+        }
+    }
+    if (context.getPaddedNumAtoms() > context.getNumAtoms()) {
+        // The last block contains padding atoms, so every tile in its row needs exclusion data.
+        for (int i = 0; i < numAtomBlocks; ++i)
+            tilesWithExclusions.insert(make_pair(numAtomBlocks-1, i));
+    }
+    // Flatten the set into a row-indexed table: entries [rowIndices[x], rowIndices[x+1]) of exclusionIndicesVec
+    // hold the column blocks of the tiles in row x.
+    vector<unsigned int> exclusionRowIndicesVec(numAtomBlocks+1, 0);
+    vector<unsigned int> exclusionIndicesVec;
+    int currentRow = 0;
+    for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) {
+        while (iter->first != currentRow)
+            exclusionRowIndicesVec[++currentRow] = exclusionIndicesVec.size();
+        exclusionIndicesVec.push_back(iter->second);
+    }
+    // Close the final row, and also any trailing rows that contain no tiles with exclusions, so that
+    // every row's range is well formed and nothing is written past the end of the table.
+    while (currentRow < numAtomBlocks)
+        exclusionRowIndicesVec[++currentRow] = exclusionIndicesVec.size();
+    exclusionIndices = CudaArray::create<unsigned int>(exclusionIndicesVec.size(), "exclusionIndices");
+    exclusionRowIndices = CudaArray::create<unsigned int>(exclusionRowIndicesVec.size(), "exclusionRowIndices");
+    exclusionIndices->upload(exclusionIndicesVec);
+    exclusionRowIndices->upload(exclusionRowIndicesVec);
+    // Record the exclusion data.  Each tile owns TileSize consecutive words; the word is selected by the offset
+    // of the atom in the higher-numbered block and the bit by the offset of the atom in the lower-numbered block.
+    // A cleared bit marks an excluded pair.
+    exclusions = CudaArray::create<unsigned int>(tilesWithExclusions.size()*CudaContext::TileSize, "exclusions");
+    vector<unsigned int> exclusionVec(exclusions->getSize());
+    for (int i = 0; i < exclusions->getSize(); ++i)
+        exclusionVec[i] = 0xFFFFFFFF;
+    for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
+        int x = atom1/CudaContext::TileSize;
+        int offset1 = atom1-x*CudaContext::TileSize;
+        for (int j = 0; j < (int) atomExclusions[atom1].size(); ++j) {
+            int atom2 = atomExclusions[atom1][j];
+            int y = atom2/CudaContext::TileSize;
+            int offset2 = atom2-y*CudaContext::TileSize;
+            // The shifts use an unsigned 1 so that reaching the top bit never shifts into the sign bit of an int.
+            if (x > y) {
+                int index = findExclusionIndex(x, y, exclusionIndicesVec, exclusionRowIndicesVec);
+                exclusionVec[index+offset1] &= 0xFFFFFFFF-(1u<<offset2);
+            }
+            else {
+                int index = findExclusionIndex(y, x, exclusionIndicesVec, exclusionRowIndicesVec);
+                exclusionVec[index+offset2] &= 0xFFFFFFFF-(1u<<offset1);
+            }
+        }
+    }
+    // Mark all interactions that involve a padding atom as being excluded.
+    for (int atom1 = context.getNumAtoms(); atom1 < context.getPaddedNumAtoms(); ++atom1) {
+        int x = atom1/CudaContext::TileSize;
+        int offset1 = atom1-x*CudaContext::TileSize;
+        for (int atom2 = 0; atom2 < context.getPaddedNumAtoms(); ++atom2) {
+            int y = atom2/CudaContext::TileSize;
+            int offset2 = atom2-y*CudaContext::TileSize;
+            // Both tests pass on a diagonal tile (x == y), which clears the bit in both orientations.
+            if (x >= y) {
+                int index = findExclusionIndex(x, y, exclusionIndicesVec, exclusionRowIndicesVec);
+                exclusionVec[index+offset1] &= 0xFFFFFFFF-(1u<<offset2);
+            }
+            if (y >= x) {
+                int index = findExclusionIndex(y, x, exclusionIndicesVec, exclusionRowIndicesVec);
+                exclusionVec[index+offset2] &= 0xFFFFFFFF-(1u<<offset1);
+            }
+        }
+    }
+    atomExclusions.clear(); // We won't use this again, so free the memory it used
+    exclusions->upload(exclusionVec);
+    // Create data structures for the neighbor list.
+    if (useCutoff) {
+        // Select a size for the arrays that hold the neighbor list.  This estimate is intentionally very
+        // high, because if it ever is too small, we have to fall back to the N^2 algorithm.
+        double4 boxSize = context.getPeriodicBoxSize();
+        maxTiles = (int) (numTiles*(cutoff/boxSize.x+cutoff/boxSize.y+cutoff/boxSize.z));
+        // Clamp the estimate to the range [1, numTiles].
+        if (maxTiles > numTiles)
+            maxTiles = numTiles;
+        if (maxTiles < 1)
+            maxTiles = 1;
+        interactingTiles = CudaArray::create<ushort2>(maxTiles, "interactingTiles");
+        interactionFlags = CudaArray::create<unsigned int>(maxTiles, "interactionFlags");
+        interactionCount = CudaArray::create<unsigned int>(1, "interactionCount");
+        blockCenter = CudaArray::create<float4>(numAtomBlocks, "blockCenter");
+        blockBoundingBox = CudaArray::create<float4>(numAtomBlocks, "blockBoundingBox");
+        // Host-side buffer for the interaction count, released with cuMemFreeHost() in the destructor.
+        CHECK_RESULT(cuMemHostAlloc((void**) &pinnedInteractionCount, sizeof(unsigned int), 0));
+        pinnedInteractionCount[0] = 0;
+        interactionCount->upload(pinnedInteractionCount);
+    }
+    // Create kernels.
+    forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
+    if (useCutoff) {
+        map<string, string> defines;
+        defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
+        defines["CUTOFF_SQUARED"] = context.doubleToString(cutoff*cutoff);
+        if (usePeriodic)
+            defines["USE_PERIODIC"] = "1";
+        CUmodule interactingBlocksProgram = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::findInteractingBlocks, defines);
+        // The argument lists hold addresses of members and of device pointers, not copies of their values.
+        // updateNeighborListSize() relies on the positions used here when it replaces the neighbor list arrays.
+        findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds");
+        findBlockBoundsArgs.push_back(&numAtoms);
+        findBlockBoundsArgs.push_back(context.getPeriodicBoxSizePointer());
+        findBlockBoundsArgs.push_back(context.getInvPeriodicBoxSizePointer());
+        findBlockBoundsArgs.push_back(&context.getPosq().getDevicePointer());
+        findBlockBoundsArgs.push_back(&blockCenter->getDevicePointer());
+        findBlockBoundsArgs.push_back(&blockBoundingBox->getDevicePointer());
+        findBlockBoundsArgs.push_back(&interactionCount->getDevicePointer());
+        findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions");
+        findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
+        findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer());
+        findInteractingBlocksArgs.push_back(&blockCenter->getDevicePointer());
+        findInteractingBlocksArgs.push_back(&blockBoundingBox->getDevicePointer());
+        findInteractingBlocksArgs.push_back(&interactionCount->getDevicePointer());
+        findInteractingBlocksArgs.push_back(&interactingTiles->getDevicePointer());
+        findInteractingBlocksArgs.push_back(&interactionFlags->getDevicePointer());
+        findInteractingBlocksArgs.push_back(&context.getPosq().getDevicePointer());
+        findInteractingBlocksArgs.push_back(&maxTiles);
+        findInteractingBlocksArgs.push_back(&startTileIndex);
+        findInteractingBlocksArgs.push_back(&numTiles);
+        findInteractionsWithinBlocksKernel = context.getKernel(interactingBlocksProgram, "findInteractionsWithinBlocks");
+        findInteractionsWithinBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
+        findInteractionsWithinBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer());
+        findInteractionsWithinBlocksArgs.push_back(&context.getPosq().getDevicePointer());
+        findInteractionsWithinBlocksArgs.push_back(&interactingTiles->getDevicePointer());
+        findInteractionsWithinBlocksArgs.push_back(&blockCenter->getDevicePointer());
+        findInteractionsWithinBlocksArgs.push_back(&blockBoundingBox->getDevicePointer());
+        findInteractionsWithinBlocksArgs.push_back(&interactionFlags->getDevicePointer());
+        findInteractionsWithinBlocksArgs.push_back(&interactionCount->getDevicePointer());
+        findInteractionsWithinBlocksArgs.push_back(&maxTiles);
+    }
+}
+/**
+ * Find where the exclusion data for tile (x, y) begins.
+ *
+ * @param x                    the row (atom block) of the tile
+ * @param y                    the column (atom block) of the tile
+ * @param exclusionIndices     the column of every tile with exclusions, grouped by row
+ * @param exclusionRowIndices  for each row, the position in exclusionIndices where that row begins
+ * @return the position of the tile within exclusionIndices, multiplied by CudaContext::TileSize
+ * @throws OpenMMException if row x has no entry for column y
+ */
+int CudaNonbondedUtilities::findExclusionIndex(int x, int y, const vector<unsigned int>& exclusionIndices, const vector<unsigned int>& exclusionRowIndices) {
+    const int rowEnd = exclusionRowIndices[x+1];
+    for (int entry = exclusionRowIndices[x]; entry < rowEnd; ++entry) {
+        if (exclusionIndices[entry] == y)
+            return entry*CudaContext::TileSize;
+    }
+    throw OpenMMException("Internal error: exclusion in unexpected tile");
+}
+/**
+ * Rebuild the neighbor list by running the three neighbor list kernels in order.  Does nothing unless a
+ * cutoff is in use.
+ *
+ * @throws OpenMMException if periodic boundary conditions are used and any box dimension is smaller than
+ *         1.999999 times the cutoff
+ */
+void CudaNonbondedUtilities::prepareInteractions() {
+    if (!useCutoff)
+        return;
+    if (usePeriodic) {
+        const double minAllowedSize = 1.999999*cutoff;
+        const double4 box = context.getPeriodicBoxSize();
+        const bool boxTooSmall = (box.x < minAllowedSize || box.y < minAllowedSize || box.z < minAllowedSize);
+        if (boxTooSmall)
+            throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
+    }
+    // Compute the neighbor list.  All three launches are sized by the number of atoms; only the last one
+    // passes an explicit block size (128).
+    const int workSize = context.getNumAtoms();
+    context.executeKernel(findBlockBoundsKernel, &findBlockBoundsArgs[0], workSize);
+    context.executeKernel(findInteractingBlocksKernel, &findInteractingBlocksArgs[0], workSize);
+    context.executeKernel(findInteractionsWithinBlocksKernel, &findInteractionsWithinBlocksArgs[0], workSize, 128);
+}
+/**
+ * Launch the default interaction kernel.  Does nothing if no interaction has been added.
+ */
+void CudaNonbondedUtilities::computeInteractions() {
+    // A cutoff of -1.0 is the marker for "no interactions were registered".
+    if (cutoff == -1.0)
+        return;
+    const int totalThreads = numForceThreadBlocks*forceThreadBlockSize;
+    context.executeKernel(forceKernel, &forceArgs[0], totalThreads, forceThreadBlockSize);
+}
+/**
+ * Check whether the most recent neighbor list fit into its arrays, and replace them with larger ones if
+ * it did not.  Does nothing unless a cutoff is in use.
+ */
+void CudaNonbondedUtilities::updateNeighborListSize() {
+    if (!useCutoff)
+        return;
+    // Fetch the interaction count into the host buffer so it can be compared with the array capacity.
+    interactionCount->download(pinnedInteractionCount);
+    if (pinnedInteractionCount[0] <= (unsigned int) maxTiles)
+        return;
+    // The most recent timestep had too many interactions to fit in the arrays.  Make the arrays bigger to prevent
+    // this from happening in the future.
+    maxTiles = (int) (1.2*pinnedInteractionCount[0]);
+    // Never allocate more entries than there are tiles in total (named so as not to hide the numTiles member).
+    int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
+    if (maxTiles > totalTiles)
+        maxTiles = totalTiles;
+    // Each pointer is cleared right after the delete, so that the destructor cannot free the old array a
+    // second time if creating the replacement throws.
+    delete interactingTiles;
+    interactingTiles = NULL;
+    interactingTiles = CudaArray::create<ushort2>(maxTiles, "interactingTiles");
+    // The fixed positions below match the order in which initialize() and createInteractionKernel()
+    // pushed these arguments.
+    forceArgs[8] = &interactingTiles->getDevicePointer();
+    findInteractingBlocksArgs[5] = &interactingTiles->getDevicePointer();
+    delete interactionFlags;
+    interactionFlags = NULL;
+    interactionFlags = CudaArray::create<unsigned int>(maxTiles, "interactionFlags");
+    forceArgs[13] = &interactionFlags->getDevicePointer();
+    findInteractingBlocksArgs[6] = &interactionFlags->getDevicePointer();
+    findInteractionsWithinBlocksArgs[3] = &interactingTiles->getDevicePointer();
+    findInteractionsWithinBlocksArgs[6] = &interactionFlags->getDevicePointer();
+}
+/**
+ * Set the range of tiles that this context should process.
+ *
+ * @param startTileIndex    the index of the first tile to process
+ * @param numTiles          the number of tiles to process
+ */
+void CudaNonbondedUtilities::setTileRange(int startTileIndex, int numTiles) {
+    // The parameters hide the members of the same name, hence the explicit this->.
+    this->numTiles = numTiles;
+    this->startTileIndex = startTileIndex;
+}
+/**
+ * Create a kernel for evaluating a nonbonded interaction.  The interaction code and the generated
+ * parameter handling code are substituted into the nonbonded kernel template, which is then compiled
+ * with defines describing the cutoff, periodicity, exclusions and system size.
+ *
+ * The kernel arguments are appended to the forceArgs member as pointers, including pointers to the memory
+ * handles stored inside params and arguments.  Those vectors must therefore stay alive and must not be
+ * resized for as long as the kernel is used.
+ * NOTE(review): every call appends to the same forceArgs member, so calling this more than once would
+ * leave entries from the earlier call in front of the new ones -- confirm this is intended.
+ *
+ * @param source        the source code for evaluating the force and energy
+ * @param params        the per-atom parameters this kernel may depend on
+ * @param arguments     arrays (other than per-atom parameters) that should be passed as arguments to the kernel
+ * @param useExclusions specifies whether exclusions are applied to this interaction
+ * @param isSymmetric   specifies whether the interaction is symmetric
+ * @return the compiled "computeNonbonded" kernel
+ */
+CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) {
+    map<string, string> replacements;
+    replacements["COMPUTE_INTERACTION"] = source;
+    const string suffixes[] = {"x", "y", "z", "w"};
+    // One stream per template placeholder.  Each parameter contributes to all of them, and the streams are
+    // independent of each other, so a single pass over params produces the text for every placeholder.
+    stringstream localData, args, loadLocal1, loadLocal2, load1, load2j;
+    int localDataSize = 0;
+    for (int i = 0; i < (int) params.size(); i++) {
+        ParameterInfo& param = params[i];
+        const string& name = param.getName();
+        const string& type = param.getType();
+        const int numComponents = param.getNumComponents();
+        // Every per-atom parameter is passed to the kernel as a read-only array called global_<name>,
+        // and loaded for the first atom into a variable called <name>1.
+        args<<", const "<<type<<"* __restrict__ global_"<<name;
+        load1<<type<<" "<<name<<"1 = global_"<<name<<"[atom1];\n";
+        if (numComponents == 1) {
+            localData<<type<<" "<<name<<";\n";
+            loadLocal1<<"localData[localAtomIndex]."<<name<<" = "<<name<<"1;\n";
+            loadLocal2<<"localData[localAtomIndex]."<<name<<" = global_"<<name<<"[j];\n";
+            load2j<<type<<" "<<name<<"2 = localData[atom2]."<<name<<";\n";
+        }
+        else {
+            // Vector valued parameters are stored as one scalar field per component (<name>_x, <name>_y, ...)
+            // and reassembled with make_<type>() when they are read back.
+            loadLocal2<<type<<" temp_"<<name<<" = global_"<<name<<"[j];\n";
+            load2j<<type<<" "<<name<<"2 = make_"<<type<<"(";
+            for (int c = 0; c < numComponents; ++c) {
+                localData<<param.getComponentType()<<" "<<name<<"_"<<suffixes[c]<<";\n";
+                loadLocal1<<"localData[localAtomIndex]."<<name<<"_"<<suffixes[c]<<" = "<<name<<"1."<<suffixes[c]<<";\n";
+                loadLocal2<<"localData[localAtomIndex]."<<name<<"_"<<suffixes[c]<<" = temp_"<<name<<"."<<suffixes[c]<<";\n";
+                if (c > 0)
+                    load2j<<", ";
+                load2j<<"localData[atom2]."<<name<<"_"<<suffixes[c];
+            }
+            load2j<<");\n";
+        }
+        localDataSize += param.getSize();
+    }
+    // The extra arrays follow the per-atom parameters in the kernel signature and keep their own names.
+    for (int i = 0; i < (int) arguments.size(); i++)
+        args<<", const "<<arguments[i].getType()<<"* __restrict__ "<<arguments[i].getName();
+    replacements["ATOM_PARAMETER_DATA"] = localData.str();
+    replacements["PARAMETER_ARGUMENTS"] = args.str();
+    replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
+    replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
+    replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
+    replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
+    map<string, string> defines;
+    if (useCutoff)
+        defines["USE_CUTOFF"] = "1";
+    if (usePeriodic)
+        defines["USE_PERIODIC"] = "1";
+    if (useExclusions)
+        defines["USE_EXCLUSIONS"] = "1";
+    if (isSymmetric)
+        defines["USE_SYMMETRIC"] = "1";
+    defines["THREAD_BLOCK_SIZE"] = context.intToString(forceThreadBlockSize);
+    defines["CUTOFF_SQUARED"] = context.doubleToString(cutoff*cutoff);
+    defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
+    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
+    defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
+    // Set when the parameter data occupies an even number of 4-byte words and double precision is not in use.
+    if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision())
+        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
+    CUmodule program = context.createModule(context.replaceStrings(CudaKernelSources::vectorOps+CudaKernelSources::nonbonded, replacements), defines);
+    CUfunction kernel = context.getKernel(program, "computeNonbonded");
+    // Set arguments to the Kernel.  updateNeighborListSize() depends on this order: interactingTiles is
+    // argument 8 and interactionFlags is argument 13 when a cutoff is used.
+    forceArgs.push_back(&context.getForce().getDevicePointer());
+    forceArgs.push_back(&context.getEnergyBuffer().getDevicePointer());
+    forceArgs.push_back(&context.getPosq().getDevicePointer());
+    forceArgs.push_back(&exclusions->getDevicePointer());
+    forceArgs.push_back(&exclusionIndices->getDevicePointer());
+    forceArgs.push_back(&exclusionRowIndices->getDevicePointer());
+    forceArgs.push_back(&startTileIndex);
+    forceArgs.push_back(&numTiles);
+    if (useCutoff) {
+        forceArgs.push_back(&interactingTiles->getDevicePointer());
+        forceArgs.push_back(&interactionCount->getDevicePointer());
+        forceArgs.push_back(context.getPeriodicBoxSizePointer());
+        forceArgs.push_back(context.getInvPeriodicBoxSizePointer());
+        forceArgs.push_back(&maxTiles);
+        forceArgs.push_back(&interactionFlags->getDevicePointer());
+    }
+    for (int i = 0; i < (int) params.size(); i++)
+        forceArgs.push_back(&params[i].getMemory());
+    for (int i = 0; i < (int) arguments.size(); i++)
+        forceArgs.push_back(&arguments[i].getMemory());
+    return kernel;
+}
--- a/platforms/cuda2/src/CudaNonbondedUtilities.h
+++ b/platforms/cuda2/src/CudaNonbondedUtilities.h
@@ -38,7 +38,7 @@ namespace OpenMM {
 /**
 * This class provides a generic interface for calculating nonbonded interactions.  It does this in two
- * ways.  First, it can be used to create Kernels that evaluate nonbonded interactions.  Clients
+ * ways.  First, it can be used to create kernels that evaluate nonbonded interactions.  Clients
 * only need to provide the code for evaluating a single interaction and the list of parameters it depends on.
 * A complete kernel is then synthesized using an appropriate algorithm to evaluate all interactions on all
 * atoms.
@@ -64,209 +64,199 @@ namespace OpenMM {
 class OPENMM_EXPORT CudaNonbondedUtilities {
 public:
    class ParameterInfo;
-//    CudaNonbondedUtilities(CudaContext& context);
+    CudaNonbondedUtilities(CudaContext& context);
-//    ~CudaNonbondedUtilities();
+    ~CudaNonbondedUtilities();
-//    /**
+    /**
-//     * Add a nonbonded interaction to be evaluated by the default interaction kernel.
+     * Add a nonbonded interaction to be evaluated by the default interaction kernel.
-//     *
+     *
-//     * @param usesCutoff     specifies whether a cutoff should be applied to this interaction
+     * @param usesCutoff     specifies whether a cutoff should be applied to this interaction
-//     * @param usesPeriodic   specifies whether periodic boundary conditions should be applied to this interaction
+     * @param usesPeriodic   specifies whether periodic boundary conditions should be applied to this interaction
-//     * @param usesExclusions specifies whether this interaction uses exclusions.  If this is true, it must have identical exclusions to every other interaction.
+     * @param usesExclusions specifies whether this interaction uses exclusions.  If this is true, it must have identical exclusions to every other interaction.
-//     * @param cutoffDistance the cutoff distance for this interaction (ignored if usesCutoff is false)
+     * @param cutoffDistance the cutoff distance for this interaction (ignored if usesCutoff is false)
-//     * @param exclusionList  for each atom, specifies the list of other atoms whose interactions should be excluded
+     * @param exclusionList  for each atom, specifies the list of other atoms whose interactions should be excluded
-//     * @param kernel         the code to evaluate the interaction
+     * @param kernel         the code to evaluate the interaction
-//     * @param forceGroup     the force group in which the interaction should be calculated
+     * @param forceGroup     the force group in which the interaction should be calculated
-//     */
+     */
-//    void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup);
+    void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup);
-//    /**
+    /**
-//     * Add a per-atom parameter that the default interaction kernel may depend on.
+     * Add a per-atom parameter that the default interaction kernel may depend on.
-//     */
+     */
-//    void addParameter(const ParameterInfo& parameter);
+    void addParameter(const ParameterInfo& parameter);
-//    /**
+    /**
-//     * Add an array (other than a per-atom parameter) that should be passed as an argument to the default interaction kernel.
+     * Add an array (other than a per-atom parameter) that should be passed as an argument to the default interaction kernel.
-//     */
+     */
-//    void addArgument(const ParameterInfo& parameter);
+    void addArgument(const ParameterInfo& parameter);
-//    /**
+    /**
-//     * Specify the list of exclusions that an interaction outside the default kernel will depend on.
+     * Specify the list of exclusions that an interaction outside the default kernel will depend on.
-//     * 
+     * 
-//     * @param exclusionList  for each atom, specifies the list of other atoms whose interactions should be excluded
+     * @param exclusionList  for each atom, specifies the list of other atoms whose interactions should be excluded
-//     */
+     */
-//    void requestExclusions(const std::vector<std::vector<int> >& exclusionList);
+    void requestExclusions(const std::vector<std::vector<int> >& exclusionList);
-//    /**
+    /**
-//     * Initialize this object in preparation for a simulation.
+     * Initialize this object in preparation for a simulation.
-//     */
+     */
-//    void initialize(const System& system);
+    void initialize(const System& system);
-//    /**
+    /**
-//     * Get the number of force buffers required for nonbonded forces.
+     * Get the number of energy buffers required for nonbonded forces.
-//     */
+     */
-//    int getNumForceBuffers() {
+    int getNumEnergyBuffers() {
-//        return numForceBuffers;
+        return numForceThreadBlocks*forceThreadBlockSize;
-//    }
+    }
-//    /**
+    /**
-//     * Get the number of energy buffers required for nonbonded forces.
+     * Get whether a cutoff is being used.
-//     */
+     */
-//    int getNumEnergyBuffers() {
+    bool getUseCutoff() {
-//        return numForceThreadBlocks*forceThreadBlockSize;
+        return useCutoff;
-//    }
+    }
-//    /**
+    /**
-//     * Get whether a cutoff is being used.
+     * Get whether periodic boundary conditions are being used.
-//     */
+     */
-//    bool getUseCutoff() {
+    bool getUsePeriodic() {
-//        return useCutoff;
+        return usePeriodic;
-//    }
+    }
-//    /**
+    /**
-//     * Get whether periodic boundary conditions are being used.
+     * Get the number of thread blocks used for computing nonbonded forces.
-//     */
+     */
-//    bool getUsePeriodic() {
+    int getNumForceThreadBlocks() {
-//        return usePeriodic;
+        return numForceThreadBlocks;
-//    }
+    }
-//    /**
+    /**
-//     * Get whether there is one force buffer per atom block.
+     * Get the size of each thread block used for computing nonbonded forces.
-//     */
+     */
-//    bool getForceBufferPerAtomBlock() {
+    int getForceThreadBlockSize() {
-//        return forceBufferPerAtomBlock;
+        return forceThreadBlockSize;
-//    }
+    }
-//    /**
+    /**
-//     * Get the number of work groups used for computing nonbonded forces.
+     * Get the cutoff distance.
-//     */
+     */
-//    int getNumForceThreadBlocks() {
+    double getCutoffDistance() {
-//        return numForceThreadBlocks;
+        return cutoff;
-//    }
+    }
-//    /**
+    /**
-//     * Get the size of each work group used for computing nonbonded forces.
+     * Get whether any interactions have been added.
-//     */
+     */
-//    int getForceThreadBlockSize() {
+    bool getHasInteractions() {
-//        return forceThreadBlockSize;
+        return cutoff != -1.0;
-//    }
+    }
-//    /**
+    /**
-//     * Get the cutoff distance.
+     * Get the force group in which nonbonded interactions should be computed.
-//     */
+     */
-//    double getCutoffDistance() {
+    int getForceGroup() {
-//        return cutoff;
+        return nonbondedForceGroup;
-//    }
+    }
-//    /**
+    /**
-//     * Get whether any interactions have been added.
+     * Prepare to compute interactions.  This updates the neighbor list.
-//     */
+     */
-//    bool getHasInteractions() {
+    void prepareInteractions();
-//        return cutoff != -1.0;
+    /**
-//    }
+     * Compute the nonbonded interactions.
-//    /**
+     */
-//     * Get the force group in which nonbonded interactions should be computed.
+    void computeInteractions();
-//     */
+    /**
-//    int getForceGroup() {
+     * Check to see if the neighbor list arrays are large enough, and make them bigger if necessary.
-//        return nonbondedForceGroup;
+     */
-//    }
+    void updateNeighborListSize();
-//    /**
+    /**
-//     * Prepare to compute interactions.  This updates the neighbor list.
+     * Get the array containing the center of each atom block.
-//     */
+     */
-//    void prepareInteractions();
+    CudaArray& getBlockCenters() {
-//    /**
+        return *blockCenter;
-//     * Compute the nonbonded interactions.
+    }
-//     */
+    /**
-//    void computeInteractions();
+     * Get the array containing the dimensions of each atom block.
-//    /**
+     */
-//     * Check to see if the neighbor list arrays are large enough, and make them bigger if necessary.
+    CudaArray& getBlockBoundingBoxes() {
-//     */
+        return *blockBoundingBox;
-//    void updateNeighborListSize();
+    }
-//    /**
+    /**
-//     * Get the array containing the center of each atom block.
+     * Get the array whose first element contains the number of tiles with interactions.
-//     */
+     */
-//    CudaArray<mm_float4>& getBlockCenters() {
+    CudaArray& getInteractionCount() {
-//        return *blockCenter;
+        return *interactionCount;
-//    }
+    }
-//    /**
+    /**
-//     * Get the array containing the dimensions of each atom block.
+     * Get the array containing tiles with interactions.
-//     */
+     */
-//    CudaArray<mm_float4>& getBlockBoundingBoxes() {
+    CudaArray& getInteractingTiles() {
-//        return *blockBoundingBox;
+        return *interactingTiles;
-//    }
+    }
-//    /**
+    /**
-//     * Get the array whose first element contains the number of tiles with interactions.
+     * Get the array containing flags for tiles with interactions.
-//     */
+     */
-//    CudaArray<cl_uint>& getInteractionCount() {
+    CudaArray& getInteractionFlags() {
-//        return *interactionCount;
+        return *interactionFlags;
-//    }
+    }
-//    /**
+    /**
-//     * Get the array containing tiles with interactions.
+     * Get the array containing exclusion flags.
-//     */
+     */
-//    CudaArray<mm_ushort2>& getInteractingTiles() {
+    CudaArray& getExclusions() {
-//        return *interactingTiles;
+        return *exclusions;
-//    }
+    }
-//    /**
+    /**
-//     * Get the array containing flags for tiles with interactions.
+     * Get the array containing the index into the exclusion array for each tile.
-//     */
+     */
-//    CudaArray<cl_uint>& getInteractionFlags() {
+    CudaArray& getExclusionIndices() {
-//        return *interactionFlags;
+        return *exclusionIndices;
-//    }
+    }
-//    /**
+    /**
-//     * Get the array containing exclusion flags.
+     * Get the array listing where the exclusion data starts for each row.
-//     */
+     */
-//    CudaArray<cl_uint>& getExclusions() {
+    CudaArray& getExclusionRowIndices() {
-//        return *exclusions;
+        return *exclusionRowIndices;
-//    }
+    }
-//    /**
+    /**
-//     * Get the array containing the index into the exclusion array for each tile.
+     * Get the index of the first tile this context is responsible for processing.
-//     */
+     */
-//    CudaArray<cl_uint>& getExclusionIndices() {
+    int getStartTileIndex() const {
-//        return *exclusionIndices;
+        return startTileIndex;
-//    }
+    }
-//    /**
+    /**
-//     * Get the array listing where the exclusion data starts for each row.
+     * Get the total number of tiles this context is responsible for processing.
-//     */
+     */
-//    CudaArray<cl_uint>& getExclusionRowIndices() {
+    int getNumTiles() const {
-//        return *exclusionRowIndices;
+        return numTiles;
-//    }
+    }
-//    /**
+    /**
-//     * Get the index of the first tile this context is responsible for processing.
+     * Set the range of tiles that should be processed by this context.
-//     */
+     */
-//    int getStartTileIndex() const {
+    void setTileRange(int startTileIndex, int numTiles);
-//        return startTileIndex;
+    /**
-//    }
+     * Create a kernel for evaluating a nonbonded interaction.  Cutoffs and periodic boundary conditions
-//    /**
+     * are assumed to be the same as those for the default interaction kernel, since this kernel will use
-//     * Get the total number of tiles this context is responsible for processing.
+     * the same neighbor list.
-//     */
+     * 
-//    int getNumTiles() const {
+     * @param source        the source code for evaluating the force and energy
-//        return numTiles;
+     * @param params        the per-atom parameters this kernel may depend on
-//    }
+     * @param arguments     arrays (other than per-atom parameters) that should be passed as arguments to the kernel
-//    /**
+     * @param useExclusions specifies whether exclusions are applied to this interaction
-//     * Set the range of tiles that should be processed by this context.
+     * @param isSymmetric   specifies whether the interaction is symmetric
-//     */
+     */
-//    void setTileRange(int startTileIndex, int numTiles);
+    CUfunction createInteractionKernel(const std::string& source, std::vector<ParameterInfo>& params, std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric);
-//    /**
-//     * Create a Kernel for evaluating a nonbonded interaction.  Cutoffs and periodic boundary conditions
-//     * are assumed to be the same as those for the default interaction Kernel, since this kernel will use
-//     * the same neighbor list.
-//     * 
-//     * @param source        the source code for evaluating the force and energy
-//     * @param params        the per-atom parameters this kernel may depend on
-//     * @param arguments     arrays (other than per-atom parameters) that should be passed as arguments to the kernel
-//     * @param useExclusions specifies whether exclusions are applied to this interaction
-//     * @param isSymmetric   specifies whether the interaction is symmetric
-//     */
-//    cl::Kernel createInteractionKernel(const std::string& source, const std::vector<ParameterInfo>& params, const std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) const;
 private:
-//    static int findExclusionIndex(int x, int y, const std::vector<cl_uint>& exclusionIndices, const std::vector<cl_uint>& exclusionRowIndices);
+    static int findExclusionIndex(int x, int y, const std::vector<unsigned int>& exclusionIndices, const std::vector<unsigned int>& exclusionRowIndices);
-//    CudaContext& context;
+    CudaContext& context;
-//    cl::Kernel forceKernel;
+    CUfunction forceKernel;
-//    cl::Kernel findBlockBoundsKernel;
+    CUfunction findBlockBoundsKernel;
-//    cl::Kernel findInteractingBlocksKernel;
+    CUfunction findInteractingBlocksKernel;
-//    cl::Kernel findInteractionsWithinBlocksKernel;
+    CUfunction findInteractionsWithinBlocksKernel;
-//    CudaArray<cl_uint>* exclusions;
+    CudaArray* exclusions;
-//    CudaArray<cl_uint>* exclusionIndices;
+    CudaArray* exclusionIndices;
-//    CudaArray<cl_uint>* exclusionRowIndices;
+    CudaArray* exclusionRowIndices;
-//    CudaArray<mm_ushort2>* interactingTiles;
+    CudaArray* interactingTiles;
-//    CudaArray<cl_uint>* interactionFlags;
+    CudaArray* interactionFlags;
-//    CudaArray<cl_uint>* interactionCount;
+    CudaArray* interactionCount;
-//    CudaArray<mm_float4>* blockCenter;
+    CudaArray* blockCenter;
-//    CudaArray<mm_float4>* blockBoundingBox;
+    CudaArray* blockBoundingBox;
-//    std::vector<std::vector<int> > atomExclusions;
+    unsigned int* pinnedInteractionCount;
-//    std::vector<ParameterInfo> parameters;
+    std::vector<void*> forceArgs, findBlockBoundsArgs, findInteractingBlocksArgs, findInteractionsWithinBlocksArgs;
-//    std::vector<ParameterInfo> arguments;
+    std::vector<std::vector<int> > atomExclusions;
-//    std::string kernelSource;
+    std::vector<ParameterInfo> parameters;
-//    std::map<std::string, std::string> kernelDefines;
+    std::vector<ParameterInfo> arguments;
-//    double cutoff;
+    std::string kernelSource;
-//    bool useCutoff, usePeriodic, forceBufferPerAtomBlock, deviceIsCpu, anyExclusions;
+    std::map<std::string, std::string> kernelDefines;
-//    int numForceBuffers, startTileIndex, numTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup;
+    double cutoff;
+    bool useCutoff, usePeriodic, anyExclusions;
+    int startTileIndex, numTiles, maxTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup, numAtoms;
 };
 /**
@@ -309,7 +299,7 @@ public:
    int getSize() const {
        return size;
    }
-    CUdeviceptr getMemory() const {
+    CUdeviceptr& getMemory() {
        return memory;
    }
 private:

--- a/platforms/cuda2/src/CudaParameterSet.cpp
+++ b/platforms/cuda2/src/CudaParameterSet.cpp
@@ -77,7 +77,7 @@ CudaParameterSet::~CudaParameterSet() {
        CHECK_RESULT(cuMemFree(buffers[i].getMemory()));
 }
-void CudaParameterSet::getParameterValues(vector<vector<float> >& values) const {
+void CudaParameterSet::getParameterValues(vector<vector<float> >& values) {
    values.resize(numObjects);
    for (int i = 0; i < numObjects; i++)
        values[i].resize(numParameters);

--- a/platforms/cuda2/src/CudaParameterSet.h
+++ b/platforms/cuda2/src/CudaParameterSet.h
@@ -71,7 +71,7 @@ public:
     *
     * @param values on exit, values[i][j] contains the value of parameter j for object i
     */
-    void getParameterValues(std::vector<std::vector<float> >& values) const;
+    void getParameterValues(std::vector<std::vector<float> >& values);
    /**
     * Set the values of all parameters.
     *
@@ -82,7 +82,7 @@ public:
     * Get a set of CudaNonbondedUtilities::ParameterInfo objects which describe the Buffers
     * containing the data.
     */
-    const std::vector<CudaNonbondedUtilities::ParameterInfo>& getBuffers() const {
+    std::vector<CudaNonbondedUtilities::ParameterInfo>& getBuffers() {
        return buffers;
    }
    /**

--- a/platforms/cuda2/src/CudaSort.h
+++ b/platforms/cuda2/src/CudaSort.h
@@ -41,7 +41,7 @@ namespace OpenMM {
 * sort and the key for sorting it.  Here is an example of a trait class for
 * sorting floats:
 * 
- * class SortTrait : public CudaSort::SortTrait {
+ * class FloatTrait : public CudaSort::SortTrait {
 *     int getDataSize() const {return 4;}
 *     int getKeySize() const {return 4;}
 *     const char* getDataType() const {return "float";}

--- a/platforms/cuda2/src/kernels/coulombLennardJones.cu
+++ b/platforms/cuda2/src/kernels/coulombLennardJones.cu
+#if USE_EWALD // Code fragment, not a function: isExcluded, atom1, atom2, r, r2, invR, posq1, posq2, sigmaEpsilon1, sigmaEpsilon2, tempEnergy and dEdR must already be in scope. NOTE(review): presumably substituted for COMPUTE_INTERACTION in nonbonded.cu -- confirm.
+bool needCorrection = isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS; // An excluded pair of two distinct atoms, both with index below NUM_ATOMS.
+if (!isExcluded || needCorrection) {
+    real tempForce = 0.0f;
+    if (r2 < CUTOFF_SQUARED || needCorrection) {
+        const real alphaR = EWALD_ALPHA*r;
+        const real expAlphaRSqr = EXP(-alphaR*alphaR);
+        const real prefactor = 138.935456f*posq1.w*posq2.w*invR;
+        // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
+        // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
+        // error of 3e-7.
+        real t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*alphaR)*alphaR)*alphaR)*alphaR)*alphaR)*alphaR;
+        t *= t;
+        t *= t;
+        t *= t;
+        const real erfcAlphaR = RECIP(t*t); // t was squared three times above, so t*t is the 16th power of the polynomial.
+        if (needCorrection) {
+            // Subtract off the part of this interaction that was included in the reciprocal space contribution.
+            tempForce = -prefactor*((1.0f-erfcAlphaR)-alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
+            tempEnergy += -prefactor*(1.0f-erfcAlphaR);
+        }
+        else {
+#if HAS_LENNARD_JONES
+            real sig = sigmaEpsilon1.x + sigmaEpsilon2.x;
+            real sig2 = invR*sig;
+            sig2 *= sig2;
+            real sig6 = sig2*sig2*sig2;
+            real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
+            tempForce = epssig6*(12.0f*sig6 - 6.0f) + prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
+            tempEnergy += epssig6*(sig6 - 1.0f) + prefactor*erfcAlphaR;
+#else
+            tempForce = prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
+            tempEnergy += prefactor*erfcAlphaR;
+#endif
+        }
+    }
+    dEdR += tempForce*invR*invR; // tempForce is still 0 here when the pair is outside the cutoff and needs no correction.
+}
+#else // No Ewald: Lennard-Jones and/or Coulomb, using REACTION_FIELD_K and REACTION_FIELD_C in the Coulomb term when USE_CUTOFF is defined.
+{
+#ifdef USE_CUTOFF
+    unsigned int includeInteraction = (!isExcluded && r2 < CUTOFF_SQUARED);
+#else
+    unsigned int includeInteraction = (!isExcluded);
+#endif
+    real tempForce = 0.0f; // The force is always evaluated below; includeInteraction only decides whether it and the energy are accumulated.
+  #if HAS_LENNARD_JONES
+    real sig = sigmaEpsilon1.x + sigmaEpsilon2.x;
+    real sig2 = invR*sig;
+    sig2 *= sig2;
+    real sig6 = sig2*sig2*sig2;
+    real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
+    tempForce = epssig6*(12.0f*sig6 - 6.0f);
+    tempEnergy += includeInteraction ? epssig6*(sig6 - 1) : 0;
+  #endif
+#if HAS_COULOMB
+  #ifdef USE_CUTOFF
+    const real prefactor = 138.935456f*posq1.w*posq2.w;
+    tempForce += prefactor*(invR - 2.0f*REACTION_FIELD_K*r2);
+    tempEnergy += includeInteraction ? prefactor*(invR + REACTION_FIELD_K*r2 - REACTION_FIELD_C) : 0;
+  #else
+    const real prefactor = 138.935456f*posq1.w*posq2.w*invR;
+    tempForce += prefactor;
+    tempEnergy += includeInteraction ? prefactor : 0;
+  #endif
+#endif
+    dEdR += includeInteraction ? tempForce*invR*invR : 0; // The select discards the value entirely for pairs that are not included.
+}
+#endif
\ No newline at end of file
--- a/platforms/cuda2/src/kernels/customCompoundBond.cu
+++ b/platforms/cuda2/src/kernels/customCompoundBond.cu
 /**
 * Convert a real4 to a real3 by removing its last element.
 */
-__device__ real3 ccb_trim(real4 v) {
+inline __device__ real3 ccb_trim(real4 v) {
    return make_real3(v.x, v.y, v.z);
 }
 /**
 * Compute the difference between two vectors, setting the fourth component to the squared magnitude.
 */
-__device__ real4 ccb_delta(real4 vec1, real4 vec2) {
+inline __device__ real4 ccb_delta(real4 vec1, real4 vec2) {
    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
    return result;
@@ -38,7 +38,7 @@ __device__ real ccb_computeAngle(real4 vec1, real4 vec2) {
 /**
 * Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
 */
-__device__ real4 ccb_computeCross(real4 vec1, real4 vec2) {
+inline __device__ real4 ccb_computeCross(real4 vec1, real4 vec2) {
    real3 cp = cross(vec1, vec2);
    return make_real4(cp.x, cp.y, cp.z, cp.x*cp.x+cp.y*cp.y+cp.z*cp.z);
 }
--- a/platforms/cuda2/src/kernels/ewald.cu
+++ b/platforms/cuda2/src/kernels/ewald.cu
+/**
+ * Multiply two real2 values as complex numbers, where x holds the real part and y the imaginary part.
+ */
+__device__ real2 multofReal2(real2 a, real2 b) {
+    const real realPart = a.x*b.x - a.y*b.y;
+    const real imagPart = a.x*b.y + a.y*b.x;
+    return make_real2(realPart, imagPart);
+}
+/**
+ * Precompute the cosine and sine sums which appear in each force term.
+ *
+ * Each thread starts at its global thread index and steps through the wave vector indices in
+ * strides of blockDim.x*gridDim.x.  For every index it handles, it stores in cosSinSum[index] the
+ * sum over all atoms of apos.w*(cos(k.r), sin(k.r)), and it adds that wave vector's energy term to
+ * its own slot of energyBuffer (indexed by global thread index).
+ */
+extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuffer, const real4* __restrict__ posq, real2* __restrict__ cosSinSum, real4 periodicBoxSize) {
+    const unsigned int ksizex = 2*KMAX_X-1;
+    const unsigned int ksizey = 2*KMAX_Y-1;
+    const unsigned int ksizez = 2*KMAX_Z-1;
+    const unsigned int totalK = ksizex*ksizey*ksizez;
+    real3 reciprocalBoxSize = make_real3(2*M_PI/periodicBoxSize.x, 2*M_PI/periodicBoxSize.y, 2*M_PI/periodicBoxSize.z);
+    real reciprocalCoefficient = ONE_4PI_EPS0*4*M_PI/(periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
+    unsigned int index = blockIdx.x*blockDim.x+threadIdx.x;
+    real energy = 0;
+    while (index < (KMAX_Y-1)*ksizez+KMAX_Z) // Skip the leading indices, which decode to rx == 0 with ry < 0, or with ry == 0 and rz <= 0.
+        index += blockDim.x*gridDim.x;
+    while (index < totalK) {
+        // Find the wave vector (kx, ky, kz) this index corresponds to.
+        int rx = index/(ksizey*ksizez);
+        int remainder = index - rx*ksizey*ksizez;
+        int ry = remainder/ksizez;
+        int rz = remainder - ry*ksizez - KMAX_Z + 1; // Shifted so that rz runs from 1-KMAX_Z to KMAX_Z-1.
+        ry += -KMAX_Y + 1; // Likewise ry runs from 1-KMAX_Y to KMAX_Y-1; rx is never negative.
+        real kx = rx*reciprocalBoxSize.x;
+        real ky = ry*reciprocalBoxSize.y;
+        real kz = rz*reciprocalBoxSize.z;
+        // Compute the sum for this wave vector.
+        real2 sum = make_real2(0);
+        for (int atom = 0; atom < NUM_ATOMS; atom++) {
+            real4 apos = posq[atom];
+            real phase = apos.x*kx;
+            real2 structureFactor = make_real2(cos(phase), sin(phase));
+            phase = apos.y*ky;
+            structureFactor = multofReal2(structureFactor, make_real2(cos(phase), sin(phase))); // Complex product, so the phases along each axis add.
+            phase = apos.z*kz;
+            structureFactor = multofReal2(structureFactor, make_real2(cos(phase), sin(phase)));
+            sum += apos.w*structureFactor;
+        }
+        cosSinSum[index] = sum;
+        // Compute the contribution to the energy.
+        real k2 = kx*kx + ky*ky + kz*kz;
+        real ak = EXP(k2*EXP_COEFFICIENT) / k2;
+        energy += reciprocalCoefficient*ak*(sum.x*sum.x + sum.y*sum.y);
+        index += blockDim.x*gridDim.x;
+    }
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+}
+/**
+ * Compute the reciprocal space part of the Ewald force on each atom, using the sums that
+ * calculateEwaldCosSinSums() stored in cosSinSum.
+ *
+ * Each thread starts at its global thread index and steps through the atoms in strides of
+ * blockDim.x*gridDim.x.  For each atom it loops over the wave vectors with rx >= 0, skipping those
+ * with rx == 0 and either ry < 0, or ry == 0 and rz <= 0.  The accumulated force is multiplied by
+ * 0xFFFFFFFF, converted to a 64 bit integer, and atomically added to forceBuffers at offsets atom,
+ * atom+PADDED_NUM_ATOMS and atom+2*PADDED_NUM_ATOMS for the x, y and z components.
+ */
+extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__ forceBuffers, const real4* __restrict__ posq, const real2* __restrict__ cosSinSum, real4 periodicBoxSize) {
+    unsigned int atom = blockIdx.x*blockDim.x+threadIdx.x;
+    real3 reciprocalBoxSize = make_real3(2*M_PI/periodicBoxSize.x, 2*M_PI/periodicBoxSize.y, 2*M_PI/periodicBoxSize.z);
+    real reciprocalCoefficient = ONE_4PI_EPS0*4*M_PI/(periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
+    while (atom < NUM_ATOMS) {
+        real3 force = make_real3(0);
+        real4 apos = posq[atom];
+        // Loop over all wave vectors.  The lower bounds start at (ry, rz) = (0, 1) so that the zero
+        // vector and the first half of the rx == 0 plane are skipped, then widen to the full range.
+        int lowry = 0;
+        int lowrz = 1;
+        for (int rx = 0; rx < KMAX_X; rx++) {
+            real kx = rx*reciprocalBoxSize.x;
+            for (int ry = lowry; ry < KMAX_Y; ry++) {
+                real ky = ry*reciprocalBoxSize.y;
+                real phase = apos.x*kx;
+                real2 tab_xy = make_real2(cos(phase), sin(phase));
+                phase = apos.y*ky;
+                tab_xy = multofReal2(tab_xy, make_real2(cos(phase), sin(phase)));
+                for (int rz = lowrz; rz < KMAX_Z; rz++) {
+                    real kz = rz*reciprocalBoxSize.z;
+                    // Compute the force contribution of this wave vector.
+                    int index = rx*(KMAX_Y*2-1)*(KMAX_Z*2-1) + (ry+KMAX_Y-1)*(KMAX_Z*2-1) + (rz+KMAX_Z-1);
+                    real k2 = kx*kx + ky*ky + kz*kz;
+                    real ak = EXP(k2*EXP_COEFFICIENT)/k2;
+                    phase = apos.z*kz;
+                    real2 structureFactor = multofReal2(tab_xy, make_real2(cos(phase), sin(phase)));
+                    real2 sum = cosSinSum[index];
+                    real dEdR = 2*reciprocalCoefficient*ak*apos.w*(sum.x*structureFactor.y - sum.y*structureFactor.x);
+                    force.x += dEdR*kx;
+                    force.y += dEdR*ky;
+                    force.z += dEdR*kz;
+                }
+                // Only the very first (rx == 0, ry == 0) pass starts at rz = 1.  The bound is reset here,
+                // after the loop, so that it is still widened when that first pass is empty (KMAX_Z == 1).
+                lowrz = 1 - KMAX_Z;
+                lowry = 1 - KMAX_Y;
+            }
+        }
+        // Record the force on the atom.
+        atomicAdd(&forceBuffers[atom], static_cast<unsigned long long>((long long) (force.x*0xFFFFFFFF)));
+        atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0xFFFFFFFF)));
+        atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0xFFFFFFFF)));
+        atom += blockDim.x*gridDim.x;
+    }
+}
--- a/platforms/cuda2/src/kernels/findInteractingBlocks.cu
+++ b/platforms/cuda2/src/kernels/findInteractingBlocks.cu
+// Number of atoms in each block.
+#define TILE_SIZE 32
+// Stride used by the threads of a thread block when they sweep the shared buffers.
+#define GROUP_SIZE 64
+// Number of candidate tiles each thread can hold before the shared buffer must be flushed.
+#define BUFFER_GROUPS 4
+// Total number of entries in the shared buffer.  Parenthesized so the macro expands correctly
+// inside larger expressions such as n/BUFFER_SIZE or BUFFER_SIZE%k.
+#define BUFFER_SIZE (BUFFER_GROUPS*GROUP_SIZE)
+/**
+ * Find a bounding box for the atoms in each block.
+ *
+ * Each thread handles whole blocks of TILE_SIZE consecutive atoms, starting with the block whose
+ * index equals its global thread index and advancing by blockDim.x*gridDim.x.  For each block it
+ * writes the midpoint of the box to blockCenter and the half-widths of the box to blockBoundingBox.
+ * Thread 0 of thread block 0 also resets interactionCount[0] to 0.
+ */
+extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ interactionCount) {
+    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    int base = index*TILE_SIZE; // Index of the first atom in this block.
+    while (base < numAtoms) {
+        real4 pos = posq[base];
+#ifdef USE_PERIODIC
+        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; // Wrap the first atom so each coordinate lies between 0 and the box size.
+        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
+        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
+        real4 firstPoint = pos;
+#endif
+        real4 minPos = pos;
+        real4 maxPos = pos;
+        int last = min(base+TILE_SIZE, numAtoms); // The final block may contain fewer than TILE_SIZE atoms.
+        for (int i = base+1; i < last; i++) {
+            pos = posq[i];
+#ifdef USE_PERIODIC
+            pos.x -= floor((pos.x-firstPoint.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; // Use the periodic image closest to the first atom.
+            pos.y -= floor((pos.y-firstPoint.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+            pos.z -= floor((pos.z-firstPoint.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+            minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
+            maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
+        }
+        blockBoundingBox[index] = 0.5f*(maxPos-minPos); // Half-widths of the box along each axis.
+        blockCenter[index] = 0.5f*(maxPos+minPos);
+        index += blockDim.x*gridDim.x;
+        base = index*TILE_SIZE;
+    }
+    if (blockIdx.x == 0 && threadIdx.x == 0)
+        interactionCount[0] = 0;
+}
+/**
+ * This is called by findBlocksWithInteractions().  It compacts the list of blocks and writes them
+ * to global memory.
+ *
+ * The entries of buffer whose valid flag is set are packed together, a range of interactingTiles
+ * is reserved by atomically adding their number to interactionCount, and they are copied into that
+ * range provided it ends at or before maxTiles.  The count is incremented even when the copy is
+ * skipped.  On return every valid flag that was set has been cleared.
+ *
+ * This function contains __syncthreads() calls, so it must be reached by every thread of the
+ * thread block.  NOTE(review): the loops stride by GROUP_SIZE, so this looks like it expects
+ * blockDim.x == GROUP_SIZE -- confirm against the launch code.  periodicBoxSize,
+ * invPeriodicBoxSize, posq, blockCenter and blockBoundingBox are not used by the body.
+ */
+__device__ void storeInteractionData(ushort2* buffer, int* valid, short* sum, ushort2* temp, int* baseIndex,
+            unsigned int* interactionCount, ushort2* interactingTiles, real4 periodicBoxSize,
+            real4 invPeriodicBoxSize, const real4* posq, const real4* blockCenter, const real4* blockBoundingBox, unsigned int maxTiles) {
+    // The buffer is full, so we need to compact it and write out results.  Start by doing a parallel prefix sum.
+    for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
+        temp[i].x = (valid[i] ? 1 : 0);
+    __syncthreads();
+    int whichBuffer = 0; // Selects which component of temp holds the current partial sums: 0 for .x, 1 for .y.
+    for (int offset = 1; offset < BUFFER_SIZE; offset *= 2) {
+        if (whichBuffer == 0)
+            for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
+                temp[i].y = (i < offset ? temp[i].x : temp[i].x+temp[i-offset].x);
+        else
+            for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
+                temp[i].x = (i < offset ? temp[i].y : temp[i].y+temp[i-offset].y);
+        whichBuffer = 1-whichBuffer;
+        __syncthreads();
+    }
+    if (whichBuffer == 0)
+        for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
+            sum[i] = temp[i].x;
+    else
+        for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
+            sum[i] = temp[i].y;
+    __syncthreads();
+    int numValid = sum[BUFFER_SIZE-1]; // The sum is inclusive, so its last element is the total number of valid entries.
+    __syncthreads();
+    // Compact the buffer.
+    for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
+        if (valid[i]) {
+            temp[sum[i]-1] = buffer[i]; // temp is reused here to hold the packed entries.
+            sum[i] = valid[i];
+            valid[i] = false;
+            buffer[i] = make_ushort2(1, 1);
+        }
+    __syncthreads();
+    // Store it to global memory.
+    if (threadIdx.x == 0)
+        *baseIndex = atomicAdd(interactionCount, numValid); // atomicAdd returns the old count, which is where this block's range starts.
+    __syncthreads();
+    if (*baseIndex+numValid <= maxTiles)
+        for (int i = threadIdx.x; i < numValid; i += GROUP_SIZE)
+            interactingTiles[*baseIndex+i] = temp[i];
+    __syncthreads();
+}
+/**
+ * Compare the bounding boxes for each pair of blocks.  If they are sufficiently far apart,
+ * mark them as non-interacting.
+ *
+ * Tile indices from startTileIndex up to (but not including) endTileIndex are examined, one per
+ * thread per pass.  Each index is decoded into a pair of block indices (x, y) with x >= y.  A pair
+ * is kept when the squared gap between the two bounding boxes is below CUTOFF_SQUARED.  Kept pairs
+ * are collected in a shared buffer that storeInteractionData() flushes to interactingTiles.
+ *
+ * interactionFlags is not referenced by this kernel, and posq is only forwarded to
+ * storeInteractionData().
+ */
+extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ blockCenter,
+        const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ interactionCount, ushort2* __restrict__ interactingTiles,
+        unsigned int* __restrict__ interactionFlags, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int startTileIndex,
+        unsigned int endTileIndex) {
+    __shared__ ushort2 buffer[BUFFER_SIZE];
+    __shared__ int valid[BUFFER_SIZE];
+    __shared__ short sum[BUFFER_SIZE];
+    __shared__ ushort2 temp[BUFFER_SIZE];
+    __shared__ int bufferFull;
+    __shared__ int globalIndex;
+    int valuesInBuffer = 0; // Number of entries this thread has added since the last flush.
+    if (threadIdx.x == 0)
+        bufferFull = false;
+    for (int i = 0; i < BUFFER_GROUPS; ++i)
+        valid[i*GROUP_SIZE+threadIdx.x] = false; // NOTE(review): this indexing looks like it expects blockDim.x == GROUP_SIZE -- confirm against the launch code.
+    __syncthreads();
+    for (int baseIndex = startTileIndex+blockIdx.x*blockDim.x; baseIndex < endTileIndex; baseIndex += blockDim.x*gridDim.x) {
+        // Identify the pair of blocks to compare.
+        int index = baseIndex+threadIdx.x;
+        if (index < endTileIndex) {
+            unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*index));
+            unsigned int x = (index-y*NUM_BLOCKS+y*(y+1)/2);
+            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y += (x < y ? -1 : 1);
+                x = (index-y*NUM_BLOCKS+y*(y+1)/2);
+            }
+            // Find the distance between the bounding boxes of the two cells.
+            real4 delta = blockCenter[x]-blockCenter[y];
+            real4 boxSizea = blockBoundingBox[x];
+            real4 boxSizeb = blockBoundingBox[y];
+#ifdef USE_PERIODIC
+            delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+            delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+            delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+            delta.x = max(0.0f, fabs(delta.x)-boxSizea.x-boxSizeb.x); // Gap between the boxes along x; 0 if they overlap on this axis.
+            delta.y = max(0.0f, fabs(delta.y)-boxSizea.y-boxSizeb.y);
+            delta.z = max(0.0f, fabs(delta.z)-boxSizea.z-boxSizeb.z);
+            if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < CUTOFF_SQUARED) {
+                // Add this tile to the buffer.
+                int bufferIndex = valuesInBuffer*GROUP_SIZE+threadIdx.x;
+                valid[bufferIndex] = true;
+                buffer[bufferIndex] = make_ushort2(x, y);
+                valuesInBuffer++;
+                if (!bufferFull && valuesInBuffer == BUFFER_GROUPS)
+                    bufferFull = true; // Any thread that has used all of its slots forces a flush for the whole thread block.
+            }
+        }
+        __syncthreads();
+        if (bufferFull) {
+            storeInteractionData(buffer, valid, sum, temp, &globalIndex, interactionCount, interactingTiles, periodicBoxSize, invPeriodicBoxSize, posq, blockCenter, blockBoundingBox, maxTiles);
+            valuesInBuffer = 0;
+            if (threadIdx.x == 0)
+                bufferFull = false;
+            __syncthreads();
+        }
+    }
+    storeInteractionData(buffer, valid, sum, temp, &globalIndex, interactionCount, interactingTiles, periodicBoxSize, invPeriodicBoxSize, posq, blockCenter, blockBoundingBox, maxTiles); // Flush whatever is left in the buffer.
+}
+/**
+ * Compare each atom in one block to the bounding box of another block, and set
+ * flags for which ones are interacting.
+ *
+ * Each warp of TILE_SIZE threads processes a contiguous range of the tiles listed in tiles, with
+ * one thread per atom of block y.  Bit i of interactionFlags[pos] is set when atom i of block y
+ * lies within the cutoff of the bounding box of block x.  Diagonal tiles (x == y), and tiles with
+ * more than 12 bits set, have all 32 bits set instead.  Nothing is written when
+ * interactionCount[0] exceeds maxTiles.
+ */
+extern "C" __global__ void findInteractionsWithinBlocks(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq, const ushort2* __restrict__ tiles, const real4* __restrict__ blockCenter,
+            const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ interactionFlags, const unsigned int* __restrict__ interactionCount, unsigned int maxTiles) {
+    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
+    unsigned int numTiles = interactionCount[0];
+    unsigned int pos = warp*numTiles/totalWarps;
+    unsigned int end = (warp+1)*numTiles/totalWarps;
+    unsigned int index = threadIdx.x & (TILE_SIZE - 1);
+#if (__CUDA_ARCH__ < 200)
+    // Indexed by threadIdx.x, so this path supports at most 128 threads per thread block.
+    __shared__ unsigned int flags[128];
+#endif
+    if (numTiles > maxTiles)
+        return;
+    unsigned int lasty = 0xFFFFFFFF;
+    real4 apos;
+    while (pos < end) {
+        // Extract the coordinates of this tile
+        ushort2 tileIndices = tiles[pos];
+        unsigned int x = tileIndices.x;
+        unsigned int y = tileIndices.y;
+        if (x == y) {
+            if (index == 0)
+                interactionFlags[pos] = 0xFFFFFFFF;
+        }
+        else {
+            // Load the bounding box for x and the atom positions for y.
+            real4 center = blockCenter[x];
+            real4 boxSize = blockBoundingBox[x];
+            if (y != lasty)
+                apos = posq[y*TILE_SIZE+index];
+            // Find the distance of the atom from the bounding box.
+            real4 delta = apos-center;
+#ifdef USE_PERIODIC
+                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+            delta.x = max((real) 0, fabs(delta.x)-boxSize.x);
+            delta.y = max((real) 0, fabs(delta.y)-boxSize.y);
+            delta.z = max((real) 0, fabs(delta.z)-boxSize.z);
+#if (__CUDA_ARCH__ < 200)
+            flags[threadIdx.x] = (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z > CUTOFF_SQUARED ? 0 : 1 << index);
+            if (index % 4 == 0)
+                flags[threadIdx.x] += flags[threadIdx.x+1]+flags[threadIdx.x+2]+flags[threadIdx.x+3];
+            unsigned int allFlags = 0;
+            if (index == 0)
+                allFlags = flags[threadIdx.x]+flags[threadIdx.x+4]+flags[threadIdx.x+8]+flags[threadIdx.x+12]+flags[threadIdx.x+16]+flags[threadIdx.x+20]+flags[threadIdx.x+24]+flags[threadIdx.x+28];
+#else
+            // A set bit must mean "this atom is within the cutoff", exactly as in the branch above,
+            // because the force kernel only evaluates the atoms whose bits are set.
+            unsigned int allFlags = __ballot(delta.x*delta.x+delta.y*delta.y+delta.z*delta.z <= CUTOFF_SQUARED);
+#endif
+            // Sum the flags.
+            if (index == 0) {
+                // Count how many flags are set, and based on that decide whether to compute all interactions
+                // or only a fraction of them.
+                unsigned int bits = (allFlags&0x55555555) + ((allFlags>>1)&0x55555555);
+                bits = (bits&0x33333333) + ((bits>>2)&0x33333333);
+                bits = (bits&0x0F0F0F0F) + ((bits>>4)&0x0F0F0F0F);
+                bits = (bits&0x00FF00FF) + ((bits>>8)&0x00FF00FF);
+                bits = (bits&0x0000FFFF) + ((bits>>16)&0x0000FFFF);
+                interactionFlags[pos] = (bits > 12 ? 0xFFFFFFFF : allFlags);
+            }
+            lasty = y; // Remember which block's positions are loaded so they are not reloaded for the next tile.
+        }
+        pos++;
+    }
+}
--- a/platforms/cuda2/src/kernels/nonbonded.cu
+++ b/platforms/cuda2/src/kernels/nonbonded.cu
+#define TILE_SIZE 32
+#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
+typedef struct {
+    real x, y, z; // Atom position; computeNonbonded fills these from the x, y and z components of a posq entry.
+    real q; // Filled from the w component of the same posq entry.
+    real fx, fy, fz; // Force accumulated on this atom while an off-diagonal tile is processed.
+    ATOM_PARAMETER_DATA // Macro defined outside this file; presumably expands to the per-atom parameter fields -- confirm.
+#ifndef PARAMETER_SIZE_IS_EVEN
+    real padding; // Extra member present only when PARAMETER_SIZE_IS_EVEN is not defined.
+#endif
+} AtomData;
+/**
+ * Compute nonbonded interactions.
+ */
+extern "C" __global__ void computeNonbonded(
+        unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions,
+        const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
+        unsigned int startTileIndex, unsigned int numTileIndices
+#ifdef USE_CUTOFF
+        , const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags
+#endif
+        PARAMETER_ARGUMENTS) {
+    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
+#ifdef USE_CUTOFF
+    const unsigned int numTiles = interactionCount[0];
+    unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
+    unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
+#else
+    const unsigned int numTiles = numTileIndices;
+    unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
+    unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
+#endif
+    real energy = 0.0f;
+    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
+    __shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
+    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
+    __shared__ int exclusionIndex[WARPS_PER_GROUP];
+    do {
+        // Extract the coordinates of this tile
+        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
+        const unsigned int tbx = threadIdx.x - tgx;
+        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
+        unsigned int x, y;
+        real3 force = make_real3(0);
+        if (pos < end) {
+#ifdef USE_CUTOFF
+            if (numTiles <= maxTiles) {
+                ushort2 tileIndices = tiles[pos];
+                x = tileIndices.x;
+                y = tileIndices.y;
+            }
+            else
+#endif
+            {
+                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                    y += (x < y ? -1 : 1);
+                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                }
+            }
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            real4 posq1 = posq[atom1];
+            LOAD_ATOM1_PARAMETERS
+            // Locate the exclusion data for this tile.
+#ifdef USE_EXCLUSIONS
+            if (tgx < 2)
+                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
+            if (tgx == 0)
+                exclusionIndex[localGroupIndex] = -1;
+            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
+                if (exclusionIndices[i] == y)
+                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
+            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
+#else
+            bool hasExclusions = false;
+#endif
+            if (pos >= end)
+                ; // This warp is done.
+            else if (x == y) {
+                // This tile is on the diagonal.
+                const unsigned int localAtomIndex = threadIdx.x;
+                localData[localAtomIndex].x = posq1.x;
+                localData[localAtomIndex].y = posq1.y;
+                localData[localAtomIndex].z = posq1.z;
+                localData[localAtomIndex].q = posq1.w;
+                LOAD_LOCAL_PARAMETERS_FROM_1
+#ifdef USE_EXCLUSIONS
+                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
+#endif
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = !(excl & 0x1);
+#endif
+                    int atom2 = tbx+j;
+                    real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                    real invR = RSQRT(r2);
+                    real r = RECIP(invR);
+                    LOAD_ATOM2_PARAMETERS
+                    atom2 = y*TILE_SIZE+j;
+#ifdef USE_SYMMETRIC
+                    real dEdR = 0.0f;
+#else
+                    real3 dEdR1 = make_real3(0);
+                    real3 dEdR2 = make_real3(0);
+#endif
+                    real tempEnergy = 0.0f;
+                    COMPUTE_INTERACTION
+                    energy += 0.5f*tempEnergy;
+#ifdef USE_SYMMETRIC
+                    force -= delta*dEdR;
+#else
+                    force -= dEdR1;
+#endif
+#ifdef USE_EXCLUSIONS
+                    excl >>= 1;
+#endif
+                }
+            }
+            else {
+                // This is an off-diagonal tile.
+                const unsigned int localAtomIndex = threadIdx.x;
+                unsigned int j = y*TILE_SIZE + tgx;
+                real4 tempPosq = posq[j];
+                localData[localAtomIndex].x = tempPosq.x;
+                localData[localAtomIndex].y = tempPosq.y;
+                localData[localAtomIndex].z = tempPosq.z;
+                localData[localAtomIndex].q = tempPosq.w;
+                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+                localData[localAtomIndex].fx = 0.0f;
+                localData[localAtomIndex].fy = 0.0f;
+                localData[localAtomIndex].fz = 0.0f;
+#ifdef USE_CUTOFF
+                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
+                if (!hasExclusions && flags != 0xFFFFFFFF) {
+                    if (flags == 0) {
+                        // No interactions in this tile.
+                    }
+                    else {
+                        // Compute only a subset of the interactions in this tile.
+                        for (j = 0; j < TILE_SIZE; j++) {
+                            if ((flags&(1<<j)) != 0) {
+                                bool isExcluded = false;
+                                int atom2 = tbx+j;
+                                int bufferIndex = 3*threadIdx.x;
+#ifdef USE_SYMMETRIC
+                                real dEdR = 0;
+#else
+                                real3 dEdR1 = make_real3(0);
+                                real3 dEdR2 = make_real3(0);
+#endif
+                                real tempEnergy = 0.0f;
+                                real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+                                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                                if (r2 < CUTOFF_SQUARED) {
+#endif
+                                    real invR = RSQRT(r2);
+                                    real r = RECIP(invR);
+                                    LOAD_ATOM2_PARAMETERS
+                                    atom2 = y*TILE_SIZE+j;
+                                    COMPUTE_INTERACTION
+                                    energy += tempEnergy;
+#ifdef USE_CUTOFF
+                                }
+#endif
+#ifdef USE_SYMMETRIC
+                                delta *= dEdR;
+                                force -= delta;
+                                tempBuffer[bufferIndex] = delta.x;
+                                tempBuffer[bufferIndex+1] = delta.y;
+                                tempBuffer[bufferIndex+2] = delta.z;
+#else
+                                force -= dEdR1;
+                                tempBuffer[bufferIndex] = dEdR2.x;
+                                tempBuffer[bufferIndex+1] = dEdR2.y;
+                                tempBuffer[bufferIndex+2] = dEdR2.z;
+#endif
+                                // Sum the forces on atom2.
+                                if (tgx % 4 == 0) {
+                                    tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
+                                    tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
+                                    tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
+                                }
+                                if (tgx == 0) {
+                                    localData[tbx+j].fx += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
+                                    localData[tbx+j].fy += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
+                                    localData[tbx+j].fz += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
+                                }
+                            }
+                        }
+                    }
+                }
+                else
+#endif
+                {
+                    // Compute the full set of interactions in this tile.
+#ifdef USE_EXCLUSIONS
+                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
+                    excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
+#endif
+                    unsigned int tj = tgx;
+                    for (j = 0; j < TILE_SIZE; j++) {
+#ifdef USE_EXCLUSIONS
+                        bool isExcluded = !(excl & 0x1);
+#endif
+                        int atom2 = tbx+tj;
+                        real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+                        real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                        if (r2 < CUTOFF_SQUARED) {
+#endif
+                            real invR = RSQRT(r2);
+                            real r = RECIP(invR);
+                            LOAD_ATOM2_PARAMETERS
+                            atom2 = y*TILE_SIZE+tj;
+#ifdef USE_SYMMETRIC
+                            real dEdR = 0.0f;
+#else
+                            real3 dEdR1 = make_real3(0);
+                            real3 dEdR2 = make_real3(0);
+#endif
+                            real tempEnergy = 0.0f;
+                            COMPUTE_INTERACTION
+                            energy += tempEnergy;
+#ifdef USE_SYMMETRIC
+                            delta *= dEdR;
+                            force -= delta;
+                            localData[tbx+tj].fx += delta.x;
+                            localData[tbx+tj].fy += delta.y;
+                            localData[tbx+tj].fz += delta.z;
+#else
+                            force -= dEdR1;
+                            localData[tbx+tj].fx += dEdR2.x;
+                            localData[tbx+tj].fy += dEdR2.y;
+                            localData[tbx+tj].fz += dEdR2.z;
+#endif
+#ifdef USE_CUTOFF
+                        }
+#endif
+#ifdef USE_EXCLUSIONS
+                        excl >>= 1;
+#endif
+                        tj = (tj + 1) & (TILE_SIZE - 1);
+                    }
+                }
+            }
+        }
+        // Write results.
+        if (pos < end) {
+            const unsigned int offset = x*TILE_SIZE + tgx;
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0xFFFFFFFF)));
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0xFFFFFFFF)));
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0xFFFFFFFF)));
+            __threadfence_block();
+        }
+        if (pos < end && x != y) {
+            const unsigned int offset = y*TILE_SIZE + tgx;
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0xFFFFFFFF)));
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0xFFFFFFFF)));
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0xFFFFFFFF)));
+            __threadfence_block();
+        }
+        pos++;
+    } while (pos < end);
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+}
--- a/platforms/cuda2/src/kernels/nonbondedExceptions.cu
+++ b/platforms/cuda2/src/kernels/nonbondedExceptions.cu
+real4 exceptionParams = PARAMS[index];  // Exception-pair snippet: per-pair parameters, read below as .x (1/r coefficient), .y (length scale) and .z (12-6 prefactor); PARAMS and index come from the enclosing code
+real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);  // displacement from pos1 to pos2 (both defined outside this snippet)
+real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;  // squared separation
+real invR = RSQRT(r2);  // presumably 1/r (RSQRT is defined elsewhere); not guarded against r2 == 0
+real sig2 = invR*exceptionParams.y;  // params.y/r
+sig2 *= sig2;  // (params.y/r)^2
+real sig6 = sig2*sig2*sig2;  // (params.y/r)^6
+real dEdR = exceptionParams.z*(12.0f*sig6-6.0f)*sig6;  // 12-6 term: params.z*(12*s^12 - 6*s^6) with s = params.y/r
+real tempEnergy = exceptionParams.z*(sig6-1.0f)*sig6;  // 12-6 energy: params.z*(s^12 - s^6)
+dEdR += exceptionParams.x*invR;  // add the params.x/r contribution to the force magnitude
+dEdR *= invR*invR;  // divide by r^2 so that scaling delta (length r) by dEdR yields the force vector
+tempEnergy += exceptionParams.x*invR;  // add the params.x/r energy term
+energy += tempEnergy;  // accumulate into the caller's running energy
+delta *= dEdR;
+real3 force1 = -delta;  // force on the first particle
+real3 force2 = delta;  // equal and opposite force on the second particle
--- a/platforms/cuda2/src/kernels/pme.cu
+++ b/platforms/cuda2/src/kernels/pme.cu
+/**
+ * For every atom, record the PME grid cell it falls in and compute its B-spline
+ * coefficients of order PME_ORDER along x, y and z.
+ *
+ * posq                 atom positions (xyz) and charges (w)
+ * pmeBsplineTheta      output; coefficient j of atom i is stored at [i+j*NUM_ATOMS], with the charge in w
+ * pmeAtomGridIndex     output; (atom index, flattened grid cell index) per atom
+ * periodicBoxSize      box dimensions
+ * invPeriodicBoxSize   reciprocals of the box dimensions
+ *
+ * Uses dynamic shared memory: each thread needs PME_ORDER real3 entries of bsplinesCache.
+ */
+extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, real4* __restrict__ pmeBsplineTheta, int2* __restrict__ pmeAtomGridIndex,
+            real4 periodicBoxSize, real4 invPeriodicBoxSize) {
+    extern __shared__ real3 bsplinesCache[];
+    // Each thread owns a private PME_ORDER-element slice of the shared cache.
+    real3* theta = &bsplinesCache[threadIdx.x*PME_ORDER];
+    const real3 finalScale = make_real3(RECIP(PME_ORDER-1));
+    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
+        // Wrap the position into the periodic box.
+        real4 wrapped = posq[atom];
+        wrapped.x -= floor(wrapped.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
+        wrapped.y -= floor(wrapped.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
+        wrapped.z -= floor(wrapped.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
+
+        // Position in units of grid cells, split into integer cell and fractional offset.
+        real3 gridPos = make_real3((wrapped.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
+                                   (wrapped.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
+                                   (wrapped.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
+        int3 whole = make_int3((int) gridPos.x, (int) gridPos.y, (int) gridPos.z);
+        real3 frac = make_real3(gridPos.x-whole.x, gridPos.y-whole.y, gridPos.z-whole.z);
+        // The modulo folds a cell index that rounded up to GRID_SIZE back to 0.
+        int3 cell = make_int3(whole.x % GRID_SIZE_X,
+                              whole.y % GRID_SIZE_Y,
+                              whole.z % GRID_SIZE_Z);
+        pmeAtomGridIndex[atom] = make_int2(atom, cell.x*GRID_SIZE_Y*GRID_SIZE_Z+cell.y*GRID_SIZE_Z+cell.z);
+
+        // Start from the order-2 coefficients and raise the order in place.
+        theta[PME_ORDER-1] = make_real3(0);
+        theta[1] = frac;
+        theta[0] = make_real3(1)-frac;
+        for (int order = 3; order < PME_ORDER; order++) {
+            real div = RECIP(order-1);
+            theta[order-1] = div*frac*theta[order-2];
+            for (int k = 1; k < (order-1); k++)
+                theta[order-k-1] = div*((frac+make_real3(k)) *theta[order-k-2] + (make_real3(order-k)-frac)*theta[order-k-1]);
+            theta[0] = div*(make_real3(1)-frac)*theta[0];
+        }
+
+        // Final step of the recursion, up to order PME_ORDER.
+        theta[PME_ORDER-1] = finalScale*frac*theta[PME_ORDER-2];
+        for (int k = 1; k < (PME_ORDER-1); k++)
+            theta[PME_ORDER-k-1] = finalScale*((frac+make_real3(k))*theta[PME_ORDER-k-2] + (make_real3(PME_ORDER-k)-frac)*theta[PME_ORDER-k-1]);
+        theta[0] = finalScale*(make_real3(1)-frac)*theta[0];
+
+        // Storing the charge alongside the coefficients improves cache coherency in the charge spreading kernel.
+        for (int k = 0; k < PME_ORDER; k++)
+            pmeBsplineTheta[atom+k*NUM_ATOMS] = make_real4(theta[k].x, theta[k].y, theta[k].z, wrapped.w);
+    }
+}
+/**
+ * For each grid point, find the range of sorted atoms associated with that point.
+ *
+ * pmeAtomGridIndex   (atom index, grid index) pairs; the loop below relies on the .y
+ *                    values being in non-decreasing order (presumably sorted by the caller)
+ * pmeAtomRange       output; entry g receives the index of the first atom whose grid index
+ *                    is >= g.  Must hold GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z+1 entries,
+ *                    since index gridSize is written as the end sentinel.
+ * posq, periodicBoxSize, invPeriodicBoxSize   unused here; kept for the kernel signature
+ */
+extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange, const real4* __restrict__ posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
+    // Split the atoms evenly between all threads.  The products are formed in 64 bits:
+    // NUM_ATOMS times the global thread index wraps around in 32-bit arithmetic for
+    // large systems, which would produce bogus start/end values.
+    const long long numThreads = (long long) blockDim.x*gridDim.x;
+    const long long thread = (long long) blockIdx.x*blockDim.x+threadIdx.x;
+    int start = (int) ((NUM_ATOMS*thread)/numThreads);
+    int end = (int) ((NUM_ATOMS*(thread+1))/numThreads);
+    // Grid index of the atom just before this thread's range (-1 if there is none).
+    int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
+    for (int i = start; i < end; ++i) {
+        int gridIndex = pmeAtomGridIndex[i].y;
+        if (gridIndex != last) {
+            // Atom i is the first one at or beyond every grid point in (last, gridIndex].
+            for (int j = last+1; j <= gridIndex; ++j)
+                pmeAtomRange[j] = i;
+            last = gridIndex;
+        }
+    }
+    // Fill in values beyond the last atom.
+    if (blockIdx.x == gridDim.x-1 && threadIdx.x == blockDim.x-1) {
+        int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
+        for (int j = last+1; j <= gridSize; ++j)
+            pmeAtomRange[j] = NUM_ATOMS;
+    }
+}
+#define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER)
+/**
+ * Spread the atomic charges onto the PME grid.
+ *
+ * Each block processes atoms in batches of BUFFER_SIZE.  Within a block, thread t handles
+ * stencil offset (ix, iy, iz) decoded from t, so a block needs at least BUFFER_SIZE threads
+ * for every stencil point to be covered; any extra threads only take part in the barriers.
+ *
+ * posq               atom positions (xyz) and charges (w)
+ * pmeGrid            output; contributions are accumulated with 64-bit integer atomics after
+ *                    scaling by 0xFFFFFFFF.  In double precision only every second entry is used.
+ * pmeBsplineTheta    B-spline coefficients, coefficient j of atom i at [i+j*NUM_ATOMS]
+ * periodicBoxSize    box dimensions
+ * invPeriodicBoxSize reciprocals of the box dimensions
+ */
+extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, unsigned long long* __restrict__ pmeGrid, const real4* __restrict__ pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
+    // Decompose the thread index into an offset inside the PME_ORDER^3 stencil.
+    int ix = threadIdx.x/(PME_ORDER*PME_ORDER);
+    int remainder = threadIdx.x-ix*PME_ORDER*PME_ORDER;
+    int iy = remainder/PME_ORDER;
+    int iz = remainder-iy*PME_ORDER;
+    // Equivalent to threadIdx.x < BUFFER_SIZE.  Threads outside the stencil do no work, but
+    // they still run the loops so that every thread in the block reaches every barrier.
+    const bool inStencil = (ix < PME_ORDER);
+    __shared__ real4 theta[PME_ORDER];
+    __shared__ real charge[BUFFER_SIZE];
+    __shared__ int basex[BUFFER_SIZE];
+    __shared__ int basey[BUFFER_SIZE];
+    __shared__ int basez[BUFFER_SIZE];
+    // The loop bounds depend only on block-uniform values, so all threads iterate together.
+    for (int baseIndex = blockIdx.x*BUFFER_SIZE; baseIndex < NUM_ATOMS; baseIndex += gridDim.x*BUFFER_SIZE) {
+        // Load the next block of atoms into the buffers.
+        if (inStencil) {
+            int atomIndex = baseIndex+threadIdx.x;
+            if (atomIndex < NUM_ATOMS) {
+                real4 pos = posq[atomIndex];
+                charge[threadIdx.x] = pos.w;
+                pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
+                pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
+                pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
+                basex[threadIdx.x] = (int) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X);
+                basey[threadIdx.x] = (int) ((pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y);
+                basez[threadIdx.x] = (int) ((pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
+            }
+        }
+        __syncthreads();
+        int lastIndex = min(BUFFER_SIZE, NUM_ATOMS-baseIndex);
+        for (int index = 0; index < lastIndex; index++) {
+            int atomIndex = index+baseIndex;
+            // The first PME_ORDER threads fetch this atom's coefficients for the whole block.
+            if (threadIdx.x < PME_ORDER)
+                theta[threadIdx.x] = pmeBsplineTheta[atomIndex+threadIdx.x*NUM_ATOMS];
+            __syncthreads();
+            if (inStencil) {
+                real add = charge[index]*theta[ix].x*theta[iy].y*theta[iz].z;
+                int x = basex[index]+ix;
+                int y = basey[index]+iy;
+                int z = basez[index]+iz;
+                // Wrap around the periodic grid.
+                x -= (x >= GRID_SIZE_X ? GRID_SIZE_X : 0);
+                y -= (y >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
+                z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
+#ifdef USE_DOUBLE_PRECISION
+                atomicAdd(&pmeGrid[2*(x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z)],  static_cast<unsigned long long>((long long) (add*0xFFFFFFFF)));
+#else
+                atomicAdd(&pmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z],  static_cast<unsigned long long>((long long) (add*0xFFFFFFFF)));
+#endif
+            }
+            // Wait until every thread has finished reading theta (and, on the last pass,
+            // charge/basex/basey/basez) before any thread overwrites them for the next
+            // atom or the next batch.
+            __syncthreads();
+        }
+    }
+}
+/**
+ * Convert the accumulated charge grid in place from 64-bit fixed-point integers to real2
+ * values: the real part is the charge scaled by EPSILON_FACTOR/0xFFFFFFFF, the imaginary part is 0.
+ *
+ * pmeGrid   on input, integer sums (every second entry in double precision);
+ *           on output, the same memory reinterpreted as one real2 per grid point
+ */
+extern "C" __global__ void finishSpreadCharge(long long* __restrict__ pmeGrid) {
+    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
+    // Undoes the 0xFFFFFFFF fixed-point factor and applies EPSILON_FACTOR in one multiply.
+    real toReal = EPSILON_FACTOR/(real) 0xFFFFFFFF;
+    real2* complexGrid = (real2*) pmeGrid;
+    for (int point = blockIdx.x*blockDim.x+threadIdx.x; point < gridSize; point += blockDim.x*gridDim.x) {
+        // Each thread reads its integer value before overwriting the same storage.
+#ifdef USE_DOUBLE_PRECISION
+        long long fixedValue = pmeGrid[2*point];
+#else
+        long long fixedValue = pmeGrid[point];
+#endif
+        complexGrid[point] = make_real2((real) (fixedValue*toReal), 0);
+    }
+}
+/**
+ * Multiply every element of the transformed grid by its reciprocal-space factor and
+ * accumulate this thread's share of the energy.
+ *
+ * pmeGrid              complex grid, scaled in place
+ * energyBuffer         one entry per thread; 0.5 times the thread's energy sum is added to it
+ * pmeBsplineModuliX/Y/Z  per-dimension moduli, indexed by the grid coordinate
+ * periodicBoxSize      box dimensions
+ * invPeriodicBoxSize   reciprocals of the box dimensions
+ */
+extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, real* __restrict__ energyBuffer, const real* __restrict__ pmeBsplineModuliX,
+        const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
+    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
+    const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
+    real threadEnergy = 0;
+    for (int point = blockIdx.x*blockDim.x+threadIdx.x; point < gridSize; point += blockDim.x*gridDim.x) {
+        // Unflatten the index into grid coordinates.
+        int kx = point/(GRID_SIZE_Y*GRID_SIZE_Z);
+        int rest = point-kx*GRID_SIZE_Y*GRID_SIZE_Z;
+        int ky = rest/GRID_SIZE_Z;
+        int kz = rest-ky*GRID_SIZE_Z;
+        // The origin is skipped: m2 would be zero there.
+        if (kx == 0 && ky == 0 && kz == 0)
+            continue;
+        // Map the upper half of each axis onto negative indices.
+        int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
+        int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
+        int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
+        real mhx = mx*invPeriodicBoxSize.x;
+        real mhy = my*invPeriodicBoxSize.y;
+        real mhz = mz*invPeriodicBoxSize.z;
+        real bx = pmeBsplineModuliX[kx];
+        real by = pmeBsplineModuliY[ky];
+        real bz = pmeBsplineModuliZ[kz];
+        real2 value = pmeGrid[point];
+        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
+        real denom = m2*bx*by*bz;
+        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
+        pmeGrid[point] = make_real2(value.x*eterm, value.y*eterm);
+        threadEnergy += eterm*(value.x*value.x + value.y*value.y);
+    }
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*threadEnergy;
+}
+/**
+ * Interpolate the force on each atom from the real part of the PME grid and add it to the
+ * fixed-point force buffers.
+ *
+ * posq               atom positions (xyz) and charges (w)
+ * forceBuffers       output; x, y and z components are stored PADDED_NUM_ATOMS apart and
+ *                    accumulated with 64-bit integer atomics after scaling by 0xFFFFFFFF
+ * pmeGrid            complex grid; only the .x component is read
+ * periodicBoxSize    box dimensions
+ * invPeriodicBoxSize reciprocals of the box dimensions
+ *
+ * Uses dynamic shared memory: 2*PME_ORDER real3 entries of bsplinesCache per thread
+ * (coefficients first, their derivatives blockDim.x*PME_ORDER entries later).
+ */
+extern "C" __global__ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real2* __restrict__ pmeGrid,
+        real4 periodicBoxSize, real4 invPeriodicBoxSize) {
+    extern __shared__ real3 bsplinesCache[];
+    real3* theta = &bsplinesCache[threadIdx.x*PME_ORDER];
+    real3* dtheta = &bsplinesCache[threadIdx.x*PME_ORDER + blockDim.x*PME_ORDER];
+    const real finalScale = RECIP(PME_ORDER-1);
+    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
+        real4 sum = make_real4(0);
+        // Wrap the position into the periodic box.
+        real4 wrapped = posq[atom];
+        wrapped.x -= floor(wrapped.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
+        wrapped.y -= floor(wrapped.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
+        wrapped.z -= floor(wrapped.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
+        real3 gridPos = make_real3((wrapped.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
+                                   (wrapped.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
+                                   (wrapped.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
+        int3 cell = make_int3(((int) gridPos.x) % GRID_SIZE_X,
+                              ((int) gridPos.y) % GRID_SIZE_Y,
+                              ((int) gridPos.z) % GRID_SIZE_Z);
+
+        // Since we need the full set of thetas, it's faster to compute them here than load them
+        // from global memory.
+        real3 frac = make_real3(gridPos.x-(int) gridPos.x, gridPos.y-(int) gridPos.y, gridPos.z-(int) gridPos.z);
+        theta[PME_ORDER-1] = make_real3(0);
+        theta[1] = frac;
+        theta[0] = make_real3(1)-frac;
+        for (int order = 3; order < PME_ORDER; order++) {
+            real div = RECIP(order-1);
+            theta[order-1] = div*frac*theta[order-2];
+            for (int k = 1; k < (order-1); k++)
+                theta[order-k-1] = div*((frac+make_real3(k))*theta[order-k-2] + (make_real3(order-k)-frac)*theta[order-k-1]);
+            theta[0] = div*(make_real3(1)-frac)*theta[0];
+        }
+
+        // The derivatives are differences of the coefficients one order below the final one,
+        // so they must be taken before the last recursion step overwrites theta.
+        dtheta[0] = -theta[0];
+        for (int k = 1; k < PME_ORDER; k++)
+            dtheta[k] = theta[k-1]-theta[k];
+
+        // Final step of the recursion, up to order PME_ORDER.
+        theta[PME_ORDER-1] = finalScale*frac*theta[PME_ORDER-2];
+        for (int k = 1; k < (PME_ORDER-1); k++)
+            theta[PME_ORDER-k-1] = finalScale*((frac+make_real3(k))*theta[PME_ORDER-k-2] + (make_real3(PME_ORDER-k)-frac)*theta[PME_ORDER-k-1]);
+        theta[0] = finalScale*(make_real3(1)-frac)*theta[0];
+
+        // Compute the force on this atom.
+        for (int ix = 0; ix < PME_ORDER; ix++) {
+            int gx = cell.x+ix;
+            gx -= (gx >= GRID_SIZE_X ? GRID_SIZE_X : 0);
+            const int xOffset = gx*GRID_SIZE_Y*GRID_SIZE_Z;
+            for (int iy = 0; iy < PME_ORDER; iy++) {
+                int gy = cell.y+iy;
+                gy -= (gy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
+                const int xyOffset = xOffset + gy*GRID_SIZE_Z;
+                for (int iz = 0; iz < PME_ORDER; iz++) {
+                    int gz = cell.z+iz;
+                    gz -= (gz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
+                    real gridvalue = pmeGrid[xyOffset + gz].x;
+                    sum.x += dtheta[ix].x*theta[iy].y*theta[iz].z*gridvalue;
+                    sum.y += theta[ix].x*dtheta[iy].y*theta[iz].z*gridvalue;
+                    sum.z += theta[ix].x*theta[iy].y*dtheta[iz].z*gridvalue;
+                }
+            }
+        }
+
+        // Scale by the charge and the grid spacing, then accumulate in fixed point.
+        real q = wrapped.w*EPSILON_FACTOR;
+        atomicAdd(&forceBuffers[atom], static_cast<unsigned long long>((long long) (-q*sum.x*GRID_SIZE_X*invPeriodicBoxSize.x*0xFFFFFFFF)));
+        atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-q*sum.y*GRID_SIZE_Y*invPeriodicBoxSize.y*0xFFFFFFFF)));
+        atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-q*sum.z*GRID_SIZE_Z*invPeriodicBoxSize.z*0xFFFFFFFF)));
+    }
+}