Fixed bug using multiple devices with PME

f1991b3c · peastman · 5ed9dd65 · f1991b3c · f1991b3c · f1991b3c
Commit f1991b3c authored May 06, 2014 by peastman
4 changed files
--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -1520,7 +1520,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    else
        dispersionCoefficient = 0.0;
    alpha = 0;
-    if (force.getNonbondedMethod() == NonbondedForce::Ewald && cu.getContextIndex() == 0) {
+    if (force.getNonbondedMethod() == NonbondedForce::Ewald) {
        // Compute the Ewald parameters.
        int kmaxx, kmaxy, kmaxz;
@@ -1528,26 +1528,28 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
        defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
        defines["USE_EWALD"] = "1";
-        ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
+        if (cu.getContextIndex() == 0) {
+            ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
-        // Create the reciprocal space kernels.
+            // Create the reciprocal space kernels.
-        map<string, string> replacements;
-        replacements["NUM_ATOMS"] = cu.intToString(numParticles);
+            map<string, string> replacements;
-        replacements["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+            replacements["NUM_ATOMS"] = cu.intToString(numParticles);
-        replacements["KMAX_X"] = cu.intToString(kmaxx);
+            replacements["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-        replacements["KMAX_Y"] = cu.intToString(kmaxy);
+            replacements["KMAX_X"] = cu.intToString(kmaxx);
-        replacements["KMAX_Z"] = cu.intToString(kmaxz);
+            replacements["KMAX_Y"] = cu.intToString(kmaxy);
-        replacements["EXP_COEFFICIENT"] = cu.doubleToString(-1.0/(4.0*alpha*alpha));
+            replacements["KMAX_Z"] = cu.intToString(kmaxz);
-        replacements["ONE_4PI_EPS0"] = cu.doubleToString(ONE_4PI_EPS0);
+            replacements["EXP_COEFFICIENT"] = cu.doubleToString(-1.0/(4.0*alpha*alpha));
-        replacements["M_PI"] = cu.doubleToString(M_PI);
+            replacements["ONE_4PI_EPS0"] = cu.doubleToString(ONE_4PI_EPS0);
-        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::ewald, replacements);
+            replacements["M_PI"] = cu.doubleToString(M_PI);
-        ewaldSumsKernel = cu.getKernel(module, "calculateEwaldCosSinSums");
+            CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::ewald, replacements);
-        ewaldForcesKernel = cu.getKernel(module, "calculateEwaldForces");
+            ewaldSumsKernel = cu.getKernel(module, "calculateEwaldCosSinSums");
-        int elementSize = (cu.getUseDoublePrecision() ? sizeof(double2) : sizeof(float2));
+            ewaldForcesKernel = cu.getKernel(module, "calculateEwaldForces");
-        cosSinSums = new CudaArray(cu, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
+            int elementSize = (cu.getUseDoublePrecision() ? sizeof(double2) : sizeof(float2));
-    }
+            cosSinSums = new CudaArray(cu, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
-    else if (force.getNonbondedMethod() == NonbondedForce::PME && cu.getContextIndex() == 0) {
+        }
+    }
+    else if (force.getNonbondedMethod() == NonbondedForce::PME) {
        // Compute the PME parameters.
        int gridSizeX, gridSizeY, gridSizeZ;
@@ -1560,140 +1562,142 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
        defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
        defines["USE_EWALD"] = "1";
-        ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
+        if (cu.getContextIndex() == 0) {
-        pmeDefines["PME_ORDER"] = cu.intToString(PmeOrder);
+            ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
-        pmeDefines["NUM_ATOMS"] = cu.intToString(numParticles);
+            pmeDefines["PME_ORDER"] = cu.intToString(PmeOrder);
-        pmeDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+            pmeDefines["NUM_ATOMS"] = cu.intToString(numParticles);
-        pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(alpha*alpha));
+            pmeDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-        pmeDefines["GRID_SIZE_X"] = cu.intToString(gridSizeX);
+            pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(alpha*alpha));
-        pmeDefines["GRID_SIZE_Y"] = cu.intToString(gridSizeY);
+            pmeDefines["GRID_SIZE_X"] = cu.intToString(gridSizeX);
-        pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ);
+            pmeDefines["GRID_SIZE_Y"] = cu.intToString(gridSizeY);
-        pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(sqrt(ONE_4PI_EPS0));
+            pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ);
-        pmeDefines["M_PI"] = cu.doubleToString(M_PI);
+            pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(sqrt(ONE_4PI_EPS0));
-        if (cu.getUseDoublePrecision())
+            pmeDefines["M_PI"] = cu.doubleToString(M_PI);
-            pmeDefines["USE_DOUBLE_PRECISION"] = "1";
+            if (cu.getUseDoublePrecision())
-        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
+                pmeDefines["USE_DOUBLE_PRECISION"] = "1";
-        if (cu.getPlatformData().useCpuPme) {
+            CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
-            // Create the CPU PME kernel.
+            if (cu.getPlatformData().useCpuPme) {
+                // Create the CPU PME kernel.
-            try {
-                cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cu.getPlatformData().context);
+                try {
-                cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha);
+                    cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cu.getPlatformData().context);
-                CUfunction addForcesKernel = cu.getKernel(module, "addForces");
+                    cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha);
-                pmeio = new PmeIO(cu, addForcesKernel);
+                    CUfunction addForcesKernel = cu.getKernel(module, "addForces");
-                cu.addPreComputation(new PmePreComputation(cu, cpuPme, *pmeio));
+                    pmeio = new PmeIO(cu, addForcesKernel);
-                cu.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
+                    cu.addPreComputation(new PmePreComputation(cu, cpuPme, *pmeio));
-            }
+                    cu.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
-            catch (OpenMMException& ex) {
+                }
-                // The CPU PME plugin isn't available.
+                catch (OpenMMException& ex) {
-            }
+                    // The CPU PME plugin isn't available.
-        }
+                }
-        if (pmeio == NULL) {
-            pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
-            pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
-            pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
-            pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
-            pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
-            pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
-            cuFuncSetCacheConfig(pmeSpreadChargeKernel, CU_FUNC_CACHE_PREFER_L1);
-            cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
-            // Create required data structures.
-            int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
-            directPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, cu.getComputeCapability() >= 2.0 ? elementSize : sizeof(long long), "originalPmeGrid");
-            reciprocalPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*(gridSizeZ/2+1), 2*elementSize, "reciprocalPmeGrid");
-            cu.addAutoclearBuffer(*directPmeGrid);
-            pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
-            pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
-            pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
-            pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
-            pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
-            sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
-            cufftResult result = cufftPlan3d(&fftForward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C);
-            if (result != CUFFT_SUCCESS)
-                throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
-            result = cufftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
-            if (result != CUFFT_SUCCESS)
-                throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
-            cufftSetCompatibilityMode(fftForward, CUFFT_COMPATIBILITY_NATIVE);
-            cufftSetCompatibilityMode(fftBackward, CUFFT_COMPATIBILITY_NATIVE);
-            hasInitializedFFT = true;
-            // Initialize the b-spline moduli.
-            int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ);
-            vector<double> data(PmeOrder);
-            vector<double> ddata(PmeOrder);
-            vector<double> bsplines_data(maxSize);
-            data[PmeOrder-1] = 0.0;
-            data[1] = 0.0;
-            data[0] = 1.0;
-            for (int i = 3; i < PmeOrder; i++) {
-                double div = 1.0/(i-1.0);
-                data[i-1] = 0.0;
-                for (int j = 1; j < (i-1); j++)
-                    data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
-                data[0] = div*data[0];
            }
+            if (pmeio == NULL) {
+                pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
+                pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
+                pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
+                pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
+                pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
+                pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
+                cuFuncSetCacheConfig(pmeSpreadChargeKernel, CU_FUNC_CACHE_PREFER_L1);
+                cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
+                // Create required data structures.
+                int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
+                directPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, cu.getComputeCapability() >= 2.0 ? elementSize : sizeof(long long), "originalPmeGrid");
+                reciprocalPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*(gridSizeZ/2+1), 2*elementSize, "reciprocalPmeGrid");
+                cu.addAutoclearBuffer(*directPmeGrid);
+                pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
+                pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
+                pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
+                pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
+                pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
+                sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
+                cufftResult result = cufftPlan3d(&fftForward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C);
+                if (result != CUFFT_SUCCESS)
+                    throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
+                result = cufftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
+                if (result != CUFFT_SUCCESS)
+                    throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
+                cufftSetCompatibilityMode(fftForward, CUFFT_COMPATIBILITY_NATIVE);
+                cufftSetCompatibilityMode(fftBackward, CUFFT_COMPATIBILITY_NATIVE);
+                hasInitializedFFT = true;
+                // Initialize the b-spline moduli.
+                int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ);
+                vector<double> data(PmeOrder);
+                vector<double> ddata(PmeOrder);
+                vector<double> bsplines_data(maxSize);
+                data[PmeOrder-1] = 0.0;
+                data[1] = 0.0;
+                data[0] = 1.0;
+                for (int i = 3; i < PmeOrder; i++) {
+                    double div = 1.0/(i-1.0);
+                    data[i-1] = 0.0;
+                    for (int j = 1; j < (i-1); j++)
+                        data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
+                    data[0] = div*data[0];
+                }
+                // Differentiate.
-            // Differentiate.
+                ddata[0] = -data[0];
+                for (int i = 1; i < PmeOrder; i++)
-            ddata[0] = -data[0];
+                    ddata[i] = data[i-1]-data[i];
-            for (int i = 1; i < PmeOrder; i++)
+                double div = 1.0/(PmeOrder-1);
-                ddata[i] = data[i-1]-data[i];
+                data[PmeOrder-1] = 0.0;
-            double div = 1.0/(PmeOrder-1);
+                for (int i = 1; i < (PmeOrder-1); i++)
-            data[PmeOrder-1] = 0.0;
+                    data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]);
-            for (int i = 1; i < (PmeOrder-1); i++)
+                data[0] = div*data[0];
-                data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]);
+                for (int i = 0; i < maxSize; i++)
-            data[0] = div*data[0];
+                    bsplines_data[i] = 0.0;
-            for (int i = 0; i < maxSize; i++)
+                for (int i = 1; i <= PmeOrder; i++)
-                bsplines_data[i] = 0.0;
+                    bsplines_data[i] = data[i-1];
-            for (int i = 1; i <= PmeOrder; i++)
-                bsplines_data[i] = data[i-1];
+                // Evaluate the actual bspline moduli for X/Y/Z.
-            // Evaluate the actual bspline moduli for X/Y/Z.
+                for(int dim = 0; dim < 3; dim++) {
+                    int ndata = (dim == 0 ? gridSizeX : dim == 1 ? gridSizeY : gridSizeZ);
-            for(int dim = 0; dim < 3; dim++) {
+                    vector<double> moduli(ndata);
-                int ndata = (dim == 0 ? gridSizeX : dim == 1 ? gridSizeY : gridSizeZ);
+                    for (int i = 0; i < ndata; i++) {
-                vector<double> moduli(ndata);
+                        double sc = 0.0;
-                for (int i = 0; i < ndata; i++) {
+                        double ss = 0.0;
-                    double sc = 0.0;
+                        for (int j = 0; j < ndata; j++) {
-                    double ss = 0.0;
+                            double arg = (2.0*M_PI*i*j)/ndata;
-                    for (int j = 0; j < ndata; j++) {
+                            sc += bsplines_data[j]*cos(arg);
-                        double arg = (2.0*M_PI*i*j)/ndata;
+                            ss += bsplines_data[j]*sin(arg);
-                        sc += bsplines_data[j]*cos(arg);
+                        }
-                        ss += bsplines_data[j]*sin(arg);
+                        moduli[i] = sc*sc+ss*ss;
                    }
-                    moduli[i] = sc*sc+ss*ss;
-                }
-                for (int i = 0; i < ndata; i++)
-                    if (moduli[i] < 1.0e-7)
-                        moduli[i] = (moduli[i-1]+moduli[i+1])*0.5;
-                if (cu.getUseDoublePrecision()) {
-                    if (dim == 0)
-                        pmeBsplineModuliX->upload(moduli);
-                    else if (dim == 1)
-                        pmeBsplineModuliY->upload(moduli);
-                    else
-                        pmeBsplineModuliZ->upload(moduli);
-                }
-                else {
-                    vector<float> modulif(ndata);
                    for (int i = 0; i < ndata; i++)
-                        modulif[i] = (float) moduli[i];
+                        if (moduli[i] < 1.0e-7)
-                    if (dim == 0)
+                            moduli[i] = (moduli[i-1]+moduli[i+1])*0.5;
-                        pmeBsplineModuliX->upload(modulif);
+                    if (cu.getUseDoublePrecision()) {
-                    else if (dim == 1)
+                        if (dim == 0)
-                        pmeBsplineModuliY->upload(modulif);
+                            pmeBsplineModuliX->upload(moduli);
-                    else
+                        else if (dim == 1)
-                        pmeBsplineModuliZ->upload(modulif);
+                            pmeBsplineModuliY->upload(moduli);
+                        else
+                            pmeBsplineModuliZ->upload(moduli);
+                    }
+                    else {
+                        vector<float> modulif(ndata);
+                        for (int i = 0; i < ndata; i++)
+                            modulif[i] = (float) moduli[i];
+                        if (dim == 0)
+                            pmeBsplineModuliX->upload(modulif);
+                        else if (dim == 1)
+                            pmeBsplineModuliY->upload(modulif);
+                        else
+                            pmeBsplineModuliZ->upload(modulif);
+                    }
                }
            }
        }

--- a/platforms/cuda/tests/TestCudaNonbondedForce.cpp
+++ b/platforms/cuda/tests/TestCudaNonbondedForce.cpp
@@ -748,7 +748,7 @@ void testChangingParameters() {
    ASSERT_EQUAL_TOL(cuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
 }
-void testParallelComputation(bool useCutoff) {
+void testParallelComputation(NonbondedForce::NonbondedMethod method) {
    System system;
    const int numParticles = 200;
    for (int i = 0; i < numParticles; i++)
@@ -756,9 +756,9 @@ void testParallelComputation(bool useCutoff) {
    NonbondedForce* force = new NonbondedForce();
    for (int i = 0; i < numParticles; i++)
        force->addParticle(i%2-0.5, 0.5, 1.0);
-    if (useCutoff)
+    force->setNonbondedMethod(method);
-        force->setNonbondedMethod(NonbondedForce::CutoffNonPeriodic);
    system.addForce(force);
+    system.setDefaultPeriodicBoxVectors(Vec3(5,0,0), Vec3(0,5,0), Vec3(0,0,5));
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);
    vector<Vec3> positions(numParticles);
@@ -877,8 +877,9 @@ int main(int argc, char* argv[]) {
        //testBlockInteractions(true);
        testDispersionCorrection();
        testChangingParameters();
-        testParallelComputation(false);
+        testParallelComputation(NonbondedForce::NoCutoff);
-        testParallelComputation(true);
+        testParallelComputation(NonbondedForce::Ewald);
+        testParallelComputation(NonbondedForce::PME);
        testSwitchingFunction(NonbondedForce::CutoffNonPeriodic);
        testSwitchingFunction(NonbondedForce::PME);
    }

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -1492,7 +1492,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
    else
        dispersionCoefficient = 0.0;
    alpha = 0;
-    if (force.getNonbondedMethod() == NonbondedForce::Ewald && cl.getContextIndex() == 0) {
+    if (force.getNonbondedMethod() == NonbondedForce::Ewald) {
        // Compute the Ewald parameters.
        int kmaxx, kmaxy, kmaxz;
@@ -1500,23 +1500,25 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
        defines["EWALD_ALPHA"] = cl.doubleToString(alpha);
        defines["TWO_OVER_SQRT_PI"] = cl.doubleToString(2.0/sqrt(M_PI));
        defines["USE_EWALD"] = "1";
-        ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
+        if (cl.getContextIndex() == 0) {
+            ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
-        // Create the reciprocal space kernels.
+            // Create the reciprocal space kernels.
-        map<string, string> replacements;
+            map<string, string> replacements;
-        replacements["NUM_ATOMS"] = cl.intToString(numParticles);
+            replacements["NUM_ATOMS"] = cl.intToString(numParticles);
-        replacements["KMAX_X"] = cl.intToString(kmaxx);
+            replacements["KMAX_X"] = cl.intToString(kmaxx);
-        replacements["KMAX_Y"] = cl.intToString(kmaxy);
+            replacements["KMAX_Y"] = cl.intToString(kmaxy);
-        replacements["KMAX_Z"] = cl.intToString(kmaxz);
+            replacements["KMAX_Z"] = cl.intToString(kmaxz);
-        replacements["EXP_COEFFICIENT"] = cl.doubleToString(-1.0/(4.0*alpha*alpha));
+            replacements["EXP_COEFFICIENT"] = cl.doubleToString(-1.0/(4.0*alpha*alpha));
-        cl::Program program = cl.createProgram(OpenCLKernelSources::ewald, replacements);
+            cl::Program program = cl.createProgram(OpenCLKernelSources::ewald, replacements);
-        ewaldSumsKernel = cl::Kernel(program, "calculateEwaldCosSinSums");
+            ewaldSumsKernel = cl::Kernel(program, "calculateEwaldCosSinSums");
-        ewaldForcesKernel = cl::Kernel(program, "calculateEwaldForces");
+            ewaldForcesKernel = cl::Kernel(program, "calculateEwaldForces");
-        int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
+            int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
-        cosSinSums = new OpenCLArray(cl, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
+            cosSinSums = new OpenCLArray(cl, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
-    }
+        }
-    else if (force.getNonbondedMethod() == NonbondedForce::PME && cl.getContextIndex() == 0) {
+    }
+    else if (force.getNonbondedMethod() == NonbondedForce::PME) {
        // Compute the PME parameters.
        int gridSizeX, gridSizeY, gridSizeZ;
@@ -1527,119 +1529,121 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
        defines["EWALD_ALPHA"] = cl.doubleToString(alpha);
        defines["TWO_OVER_SQRT_PI"] = cl.doubleToString(2.0/sqrt(M_PI));
        defines["USE_EWALD"] = "1";
-        ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
+        if (cl.getContextIndex() == 0) {
-        pmeDefines["PME_ORDER"] = cl.intToString(PmeOrder);
+            ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
-        pmeDefines["NUM_ATOMS"] = cl.intToString(numParticles);
+            pmeDefines["PME_ORDER"] = cl.intToString(PmeOrder);
-        pmeDefines["RECIP_EXP_FACTOR"] = cl.doubleToString(M_PI*M_PI/(alpha*alpha));
+            pmeDefines["NUM_ATOMS"] = cl.intToString(numParticles);
-        pmeDefines["GRID_SIZE_X"] = cl.intToString(gridSizeX);
+            pmeDefines["RECIP_EXP_FACTOR"] = cl.doubleToString(M_PI*M_PI/(alpha*alpha));
-        pmeDefines["GRID_SIZE_Y"] = cl.intToString(gridSizeY);
+            pmeDefines["GRID_SIZE_X"] = cl.intToString(gridSizeX);
-        pmeDefines["GRID_SIZE_Z"] = cl.intToString(gridSizeZ);
+            pmeDefines["GRID_SIZE_Y"] = cl.intToString(gridSizeY);
-        pmeDefines["EPSILON_FACTOR"] = cl.doubleToString(sqrt(ONE_4PI_EPS0));
+            pmeDefines["GRID_SIZE_Z"] = cl.intToString(gridSizeZ);
-        bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
+            pmeDefines["EPSILON_FACTOR"] = cl.doubleToString(sqrt(ONE_4PI_EPS0));
-        if (deviceIsCpu)
+            bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
-            pmeDefines["DEVICE_IS_CPU"] = "1";
+            if (deviceIsCpu)
-        if (cl.getPlatformData().useCpuPme) {
+                pmeDefines["DEVICE_IS_CPU"] = "1";
-            // Create the CPU PME kernel.
+            if (cl.getPlatformData().useCpuPme) {
+                // Create the CPU PME kernel.
-            try {
-                cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cl.getPlatformData().context);
+                try {
-                cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha);
+                    cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cl.getPlatformData().context);
-                cl::Program program = cl.createProgram(OpenCLKernelSources::pme, pmeDefines);
+                    cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha);
-                cl::Kernel addForcesKernel = cl::Kernel(program, "addForces");
+                    cl::Program program = cl.createProgram(OpenCLKernelSources::pme, pmeDefines);
-                pmeio = new PmeIO(cl, addForcesKernel);
+                    cl::Kernel addForcesKernel = cl::Kernel(program, "addForces");
-                cl.addPreComputation(new PmePreComputation(cl, cpuPme, *pmeio));
+                    pmeio = new PmeIO(cl, addForcesKernel);
-                cl.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
+                    cl.addPreComputation(new PmePreComputation(cl, cpuPme, *pmeio));
-            }
+                    cl.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
-            catch (OpenMMException& ex) {
-                // The CPU PME plugin isn't available.
-            }
-        }
-        if (pmeio == NULL) {
-            // Create required data structures.
-            int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
-            pmeGrid = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid");
-            cl.addAutoclearBuffer(*pmeGrid);
-            pmeGrid2 = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid2");
-            pmeBsplineModuliX = new OpenCLArray(cl, gridSizeX, elementSize, "pmeBsplineModuliX");
-            pmeBsplineModuliY = new OpenCLArray(cl, gridSizeY, elementSize, "pmeBsplineModuliY");
-            pmeBsplineModuliZ = new OpenCLArray(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ");
-            pmeBsplineTheta = new OpenCLArray(cl, PmeOrder*numParticles, 4*elementSize, "pmeBsplineTheta");
-            pmeAtomRange = OpenCLArray::create<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
-            pmeAtomGridIndex = OpenCLArray::create<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
-            sort = new OpenCLSort(cl, new SortTrait(), cl.getNumAtoms());
-            fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ);
-            // Initialize the b-spline moduli.
-            int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ);
-            vector<double> data(PmeOrder);
-            vector<double> ddata(PmeOrder);
-            vector<double> bsplines_data(maxSize);
-            data[PmeOrder-1] = 0.0;
-            data[1] = 0.0;
-            data[0] = 1.0;
-            for (int i = 3; i < PmeOrder; i++) {
-                double div = 1.0/(i-1.0);
-                data[i-1] = 0.0;
-                for (int j = 1; j < (i-1); j++)
-                    data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
-                data[0] = div*data[0];
-            }
-            // Differentiate.
-            ddata[0] = -data[0];
-            for (int i = 1; i < PmeOrder; i++)
-                ddata[i] = data[i-1]-data[i];
-            double div = 1.0/(PmeOrder-1);
-            data[PmeOrder-1] = 0.0;
-            for (int i = 1; i < (PmeOrder-1); i++)
-                data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]);
-            data[0] = div*data[0];
-            for (int i = 0; i < maxSize; i++)
-                bsplines_data[i] = 0.0;
-            for (int i = 1; i <= PmeOrder; i++)
-                bsplines_data[i] = data[i-1];
-            // Evaluate the actual bspline moduli for X/Y/Z.
-            for(int dim = 0; dim < 3; dim++) {
-                int ndata = (dim == 0 ? gridSizeX : dim == 1 ? gridSizeY : gridSizeZ);
-                vector<cl_double> moduli(ndata);
-                for (int i = 0; i < ndata; i++) {
-                    double sc = 0.0;
-                    double ss = 0.0;
-                    for (int j = 0; j < ndata; j++) {
-                        double arg = (2.0*M_PI*i*j)/ndata;
-                        sc += bsplines_data[j]*cos(arg);
-                        ss += bsplines_data[j]*sin(arg);
-                    }
-                    moduli[i] = (float) (sc*sc+ss*ss);
                }
-                for (int i = 0; i < ndata; i++)
+                catch (OpenMMException& ex) {
-                {
+                    // The CPU PME plugin isn't available.
-                    if (moduli[i] < 1.0e-7)
-                        moduli[i] = (moduli[i-1]+moduli[i+1])*0.5f;
                }
-                if (cl.getUseDoublePrecision()) {
+            }
-                    if (dim == 0)
+            if (pmeio == NULL) {
-                        pmeBsplineModuliX->upload(moduli);
+                // Create required data structures.
-                    else if (dim == 1)
-                        pmeBsplineModuliY->upload(moduli);
+                int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
-                    else
+                pmeGrid = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid");
-                        pmeBsplineModuliZ->upload(moduli);
+                cl.addAutoclearBuffer(*pmeGrid);
+                pmeGrid2 = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid2");
+                pmeBsplineModuliX = new OpenCLArray(cl, gridSizeX, elementSize, "pmeBsplineModuliX");
+                pmeBsplineModuliY = new OpenCLArray(cl, gridSizeY, elementSize, "pmeBsplineModuliY");
+                pmeBsplineModuliZ = new OpenCLArray(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ");
+                pmeBsplineTheta = new OpenCLArray(cl, PmeOrder*numParticles, 4*elementSize, "pmeBsplineTheta");
+                pmeAtomRange = OpenCLArray::create<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
+                pmeAtomGridIndex = OpenCLArray::create<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
+                sort = new OpenCLSort(cl, new SortTrait(), cl.getNumAtoms());
+                fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ);
+                // Initialize the b-spline moduli.
+                int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ);
+                vector<double> data(PmeOrder);
+                vector<double> ddata(PmeOrder);
+                vector<double> bsplines_data(maxSize);
+                data[PmeOrder-1] = 0.0;
+                data[1] = 0.0;
+                data[0] = 1.0;
+                for (int i = 3; i < PmeOrder; i++) {
+                    double div = 1.0/(i-1.0);
+                    data[i-1] = 0.0;
+                    for (int j = 1; j < (i-1); j++)
+                        data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
+                    data[0] = div*data[0];
                }
-                else {
-                    vector<float> modulif(ndata);
+                // Differentiate.
+                ddata[0] = -data[0];
+                for (int i = 1; i < PmeOrder; i++)
+                    ddata[i] = data[i-1]-data[i];
+                double div = 1.0/(PmeOrder-1);
+                data[PmeOrder-1] = 0.0;
+                for (int i = 1; i < (PmeOrder-1); i++)
+                    data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]);
+                data[0] = div*data[0];
+                for (int i = 0; i < maxSize; i++)
+                    bsplines_data[i] = 0.0;
+                for (int i = 1; i <= PmeOrder; i++)
+                    bsplines_data[i] = data[i-1];
+                // Evaluate the actual bspline moduli for X/Y/Z.
+                for(int dim = 0; dim < 3; dim++) {
+                    int ndata = (dim == 0 ? gridSizeX : dim == 1 ? gridSizeY : gridSizeZ);
+                    vector<cl_double> moduli(ndata);
+                    for (int i = 0; i < ndata; i++) {
+                        double sc = 0.0;
+                        double ss = 0.0;
+                        for (int j = 0; j < ndata; j++) {
+                            double arg = (2.0*M_PI*i*j)/ndata;
+                            sc += bsplines_data[j]*cos(arg);
+                            ss += bsplines_data[j]*sin(arg);
+                        }
+                        moduli[i] = (float) (sc*sc+ss*ss);
+                    }
                    for (int i = 0; i < ndata; i++)
-                        modulif[i] = (float) moduli[i];
+                    {
-                    if (dim == 0)
+                        if (moduli[i] < 1.0e-7)
-                        pmeBsplineModuliX->upload(modulif);
+                            moduli[i] = (moduli[i-1]+moduli[i+1])*0.5f;
-                    else if (dim == 1)
+                    }
-                        pmeBsplineModuliY->upload(modulif);
+                    if (cl.getUseDoublePrecision()) {
-                    else
+                        if (dim == 0)
-                        pmeBsplineModuliZ->upload(modulif);
+                            pmeBsplineModuliX->upload(moduli);
+                        else if (dim == 1)
+                            pmeBsplineModuliY->upload(moduli);
+                        else
+                            pmeBsplineModuliZ->upload(moduli);
+                    }
+                    else {
+                        vector<float> modulif(ndata);
+                        for (int i = 0; i < ndata; i++)
+                            modulif[i] = (float) moduli[i];
+                        if (dim == 0)
+                            pmeBsplineModuliX->upload(modulif);
+                        else if (dim == 1)
+                            pmeBsplineModuliY->upload(modulif);
+                        else
+                            pmeBsplineModuliZ->upload(modulif);
+                    }
                }
            }
        }

--- a/platforms/opencl/tests/TestOpenCLNonbondedForce.cpp
+++ b/platforms/opencl/tests/TestOpenCLNonbondedForce.cpp
@@ -751,7 +751,7 @@ void testChangingParameters() {
    ASSERT_EQUAL_TOL(clState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
 }
-void testParallelComputation(bool useCutoff) {
+void testParallelComputation(NonbondedForce::NonbondedMethod method) {
    System system;
    const int numParticles = 200;
    for (int i = 0; i < numParticles; i++)
@@ -759,9 +759,9 @@ void testParallelComputation(bool useCutoff) {
    NonbondedForce* force = new NonbondedForce();
    for (int i = 0; i < numParticles; i++)
        force->addParticle(i%2-0.5, 0.5, 1.0);
-    if (useCutoff)
+    force->setNonbondedMethod(method);
-        force->setNonbondedMethod(NonbondedForce::CutoffNonPeriodic);
    system.addForce(force);
+    system.setDefaultPeriodicBoxVectors(Vec3(5,0,0), Vec3(0,5,0), Vec3(0,0,5));
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);
    vector<Vec3> positions(numParticles);
@@ -880,8 +880,9 @@ int main(int argc, char* argv[]) {
 //        testBlockInteractions(true);
        testDispersionCorrection();
        testChangingParameters();
-        testParallelComputation(false);
+        testParallelComputation(NonbondedForce::NoCutoff);
-        testParallelComputation(true);
+        testParallelComputation(NonbondedForce::Ewald);
+        testParallelComputation(NonbondedForce::PME);
        testSwitchingFunction(NonbondedForce::CutoffNonPeriodic);
        testSwitchingFunction(NonbondedForce::PME);
    }