Merge remote-tracking branch 'upstream/master'

51e51c3b · John Chodera (MSKCC) · bc240d4a · 373c3878 · 51e51c3b · 51e51c3b
Commit 51e51c3b authored Jun 22, 2014 by John Chodera (MSKCC)
20 changed files
--- a/platforms/cuda/tests/TestCudaNonbondedForce.cpp
+++ b/platforms/cuda/tests/TestCudaNonbondedForce.cpp
@@ -748,7 +748,7 @@ void testChangingParameters() {
    ASSERT_EQUAL_TOL(cuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
 }

-void testParallelComputation(bool useCutoff) {
+void testParallelComputation(NonbondedForce::NonbondedMethod method) {
    System system;
    const int numParticles = 200;
    for (int i = 0; i < numParticles; i++)
@@ -756,9 +756,9 @@ void testParallelComputation(bool useCutoff) {
    NonbondedForce* force = new NonbondedForce();
    for (int i = 0; i < numParticles; i++)
        force->addParticle(i%2-0.5, 0.5, 1.0);
-    if (useCutoff)
-        force->setNonbondedMethod(NonbondedForce::CutoffNonPeriodic);
+    force->setNonbondedMethod(method);
    system.addForce(force);
+    system.setDefaultPeriodicBoxVectors(Vec3(5,0,0), Vec3(0,5,0), Vec3(0,0,5));
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);
    vector<Vec3> positions(numParticles);
@@ -877,8 +877,9 @@ int main(int argc, char* argv[]) {
        //testBlockInteractions(true);
        testDispersionCorrection();
        testChangingParameters();
-        testParallelComputation(false);
-        testParallelComputation(true);
+        testParallelComputation(NonbondedForce::NoCutoff);
+        testParallelComputation(NonbondedForce::Ewald);
+        testParallelComputation(NonbondedForce::PME);
        testSwitchingFunction(NonbondedForce::CutoffNonPeriodic);
        testSwitchingFunction(NonbondedForce::PME);
    }

--- a/platforms/cuda/tests/TestCudaVirtualSites.cpp
+++ b/platforms/cuda/tests/TestCudaVirtualSites.cpp
@@ -524,6 +524,47 @@ void testReordering() {
    }
 }

+/**
+ * Test a System where multiple virtual sites are all calculated from the same particles.
+ */
+void testOverlappingSites() {
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    NonbondedForce* nonbonded = new NonbondedForce();
+    system.addForce(nonbonded);
+    nonbonded->addParticle(1.0, 0.0, 0.0);
+    nonbonded->addParticle(-0.5, 0.0, 0.0);
+    nonbonded->addParticle(-0.5, 0.0, 0.0);
+    vector<Vec3> positions;
+    positions.push_back(Vec3(0, 0, 0));
+    positions.push_back(Vec3(10, 0, 0));
+    positions.push_back(Vec3(0, 10, 0));
+    for (int i = 0; i < 20; i++) {
+        system.addParticle(0.0);
+        double u = 0.1*((i+1)%4);
+        double v = 0.05*i;
+        system.setVirtualSite(3+i, new ThreeParticleAverageSite(0, 1, 2, u, v, 1-u-v));
+        nonbonded->addParticle(i%2 == 0 ? -1.0 : 1.0, 0.0, 0.0);
+        positions.push_back(Vec3());
+    }
+    VerletIntegrator i1(0.002);
+    VerletIntegrator i2(0.002);
+    Context c1(system, i1, Platform::getPlatformByName("Reference"));
+    Context c2(system, i2, platform);
+    c1.setPositions(positions);
+    c2.setPositions(positions);
+    c1.applyConstraints(0.0001);
+    c2.applyConstraints(0.0001);
+    State s1 = c1.getState(State::Positions | State::Forces);
+    State s2 = c2.getState(State::Positions | State::Forces);
+    for (int i = 0; i < system.getNumParticles(); i++)
+        ASSERT_EQUAL_VEC(s1.getPositions()[i], s2.getPositions()[i], 1e-5);
+    for (int i = 0; i < 3; i++)
+        ASSERT_EQUAL_VEC(s1.getForces()[i], s2.getForces()[i], 1e-5);
+}
+
 int main(int argc, char* argv[]) {
    try {
        if (argc > 1)
@@ -535,6 +576,7 @@ int main(int argc, char* argv[]) {
        testLocalCoordinates();
        testConservationLaws();
        testReordering();
+        testOverlappingSites();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/opencl/include/OpenCLIntegrationUtilities.h
+++ b/platforms/opencl/include/OpenCLIntegrationUtilities.h
@@ -121,7 +121,7 @@ private:
    cl::Kernel ccmaPosForceKernel, ccmaVelForceKernel;
    cl::Kernel ccmaMultiplyKernel;
    cl::Kernel ccmaPosUpdateKernel, ccmaVelUpdateKernel;
-    cl::Kernel vsitePositionKernel, vsiteForceKernel;
+    cl::Kernel vsitePositionKernel, vsiteForceKernel, vsiteAddForcesKernel;
    cl::Kernel randomKernel, timeShiftKernel;
    OpenCLArray* posDelta;
    OpenCLArray* settleAtoms;
@@ -152,7 +152,7 @@ private:
    OpenCLArray* vsiteLocalCoordsParams;
    int randomPos;
    int lastSeed, numVsites;
-    bool hasInitializedPosConstraintKernels, hasInitializedVelConstraintKernels, ccmaUseDirectBuffer;
+    bool hasInitializedPosConstraintKernels, hasInitializedVelConstraintKernels, ccmaUseDirectBuffer, hasOverlappingVsites;
    struct ShakeCluster;
    struct ConstraintOrderer;
 };

--- a/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
+++ b/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
@@ -101,7 +101,7 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
        ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL), ccmaConvergedHostBuffer(NULL),
        vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
        vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), vsiteLocalCoordsAtoms(NULL), vsiteLocalCoordsParams(NULL),
-        hasInitializedPosConstraintKernels(false), hasInitializedVelConstraintKernels(false) {
+        hasInitializedPosConstraintKernels(false), hasInitializedVelConstraintKernels(false), hasOverlappingVsites(false) {
    // Create workspace arrays.

    if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
@@ -649,6 +649,7 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
    int num3Avg = vsite3AvgAtomVec.size();
    int numOutOfPlane = vsiteOutOfPlaneAtomVec.size();
    int numLocalCoords = vsiteLocalCoordsAtomVec.size();
+    numVsites = num2Avg+num3Avg+numOutOfPlane+numLocalCoords;
    vsite2AvgAtoms = OpenCLArray::create<mm_int4>(context, max(1, num2Avg), "vsite2AvgAtoms");
    vsite3AvgAtoms = OpenCLArray::create<mm_int4>(context, max(1, num3Avg), "vsite3AvgAtoms");
    vsiteOutOfPlaneAtoms = OpenCLArray::create<mm_int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms");
@@ -706,6 +707,20 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
        }
    }
    
+    // If multiple virtual sites depend on the same particle, make sure the force distribution
+    // can be done safely.
+    
+    vector<int> atomCounts(numAtoms, 0);
+    for (int i = 0; i < numAtoms; i++)
+        if (system.isVirtualSite(i))
+            for (int j = 0; j < system.getVirtualSite(i).getNumParticles(); j++)
+                atomCounts[system.getVirtualSite(i).getParticle(j)]++;
+    for (int i = 0; i < numAtoms; i++)
+        if (atomCounts[i] > 1)
+            hasOverlappingVsites = true;
+    if (hasOverlappingVsites && context.getUseDoublePrecision() && !context.getSupports64BitGlobalAtomics())
+        throw OpenMMException("This device does not support 64 bit atomics.  Cannot use double precision when multiple virtual sites depend on the same atom.");
+    
    // Create the kernels for virtual sites.

    map<string, string> defines;
@@ -713,6 +728,10 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
    defines["NUM_3_AVERAGE"] = context.intToString(num3Avg);
    defines["NUM_OUT_OF_PLANE"] = context.intToString(numOutOfPlane);
    defines["NUM_LOCAL_COORDS"] = context.intToString(numLocalCoords);
+    defines["NUM_ATOMS"] = context.intToString(numAtoms);
+    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
+    if (hasOverlappingVsites)
+        defines["HAS_OVERLAPPING_VSITES"] = "1";
    cl::Program vsiteProgram = context.createProgram(OpenCLKernelSources::virtualSites, defines);
    vsitePositionKernel = cl::Kernel(vsiteProgram, "computeVirtualSites");
    int index = 0;
@@ -731,6 +750,8 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
    index = 0;
    vsiteForceKernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
    index++; // Skip argument 1: the force array hasn't been created yet.
+    if (context.getSupports64BitGlobalAtomics())
+        index++; // Skip argument 2: the force array hasn't been created yet.
    if (context.getUseMixedPrecision())
        vsiteForceKernel.setArg<cl::Buffer>(index++, context.getPosqCorrection().getDeviceBuffer());
    vsiteForceKernel.setArg<cl::Buffer>(index++, vsite2AvgAtoms->getDeviceBuffer());
@@ -741,7 +762,8 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
    vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteOutOfPlaneWeights->getDeviceBuffer());
    vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteLocalCoordsAtoms->getDeviceBuffer());
    vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteLocalCoordsParams->getDeviceBuffer());
-    numVsites = num2Avg+num3Avg+numOutOfPlane+numLocalCoords;
+    if (hasOverlappingVsites && context.getSupports64BitGlobalAtomics())
+        vsiteAddForcesKernel = cl::Kernel(vsiteProgram, "addDistributedForces");
 }

 OpenCLIntegrationUtilities::~OpenCLIntegrationUtilities() {
@@ -941,8 +963,25 @@ void OpenCLIntegrationUtilities::computeVirtualSites() {

 void OpenCLIntegrationUtilities::distributeForcesFromVirtualSites() {
    if (numVsites > 0) {
+        // Set arguments that didn't exist yet in the constructor.
+        
        vsiteForceKernel.setArg<cl::Buffer>(1, context.getForce().getDeviceBuffer());
+        if (context.getSupports64BitGlobalAtomics()) {
+            vsiteForceKernel.setArg<cl::Buffer>(2, context.getLongForceBuffer().getDeviceBuffer());
+            if (hasOverlappingVsites) {
+                // We'll be using 64 bit atomics for the force redistribution, so clear the buffer.
+                
+                context.clearBuffer(context.getLongForceBuffer());
+            }
+        }
        context.executeKernel(vsiteForceKernel, numVsites);
+        if (context.getSupports64BitGlobalAtomics() && hasOverlappingVsites) {
+            // Add the redistributed forces from the virtual sites to the main force array.
+            
+            vsiteAddForcesKernel.setArg<cl::Buffer>(0, context.getLongForceBuffer().getDeviceBuffer());
+            vsiteAddForcesKernel.setArg<cl::Buffer>(1, context.getForce().getDeviceBuffer());
+            context.executeKernel(vsiteAddForcesKernel, context.getNumAtoms());
+        }
    }
 }


--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -1492,7 +1492,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
    else
        dispersionCoefficient = 0.0;
    alpha = 0;
-    if (force.getNonbondedMethod() == NonbondedForce::Ewald && cl.getContextIndex() == 0) {
+    if (force.getNonbondedMethod() == NonbondedForce::Ewald) {
        // Compute the Ewald parameters.

        int kmaxx, kmaxy, kmaxz;
@@ -1500,23 +1500,25 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
        defines["EWALD_ALPHA"] = cl.doubleToString(alpha);
        defines["TWO_OVER_SQRT_PI"] = cl.doubleToString(2.0/sqrt(M_PI));
        defines["USE_EWALD"] = "1";
-        ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
+        if (cl.getContextIndex() == 0) {
+            ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);

-        // Create the reciprocal space kernels.
+            // Create the reciprocal space kernels.

-        map<string, string> replacements;
-        replacements["NUM_ATOMS"] = cl.intToString(numParticles);
-        replacements["KMAX_X"] = cl.intToString(kmaxx);
-        replacements["KMAX_Y"] = cl.intToString(kmaxy);
-        replacements["KMAX_Z"] = cl.intToString(kmaxz);
-        replacements["EXP_COEFFICIENT"] = cl.doubleToString(-1.0/(4.0*alpha*alpha));
-        cl::Program program = cl.createProgram(OpenCLKernelSources::ewald, replacements);
-        ewaldSumsKernel = cl::Kernel(program, "calculateEwaldCosSinSums");
-        ewaldForcesKernel = cl::Kernel(program, "calculateEwaldForces");
-        int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
-        cosSinSums = new OpenCLArray(cl, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
-    }
-    else if (force.getNonbondedMethod() == NonbondedForce::PME && cl.getContextIndex() == 0) {
+            map<string, string> replacements;
+            replacements["NUM_ATOMS"] = cl.intToString(numParticles);
+            replacements["KMAX_X"] = cl.intToString(kmaxx);
+            replacements["KMAX_Y"] = cl.intToString(kmaxy);
+            replacements["KMAX_Z"] = cl.intToString(kmaxz);
+            replacements["EXP_COEFFICIENT"] = cl.doubleToString(-1.0/(4.0*alpha*alpha));
+            cl::Program program = cl.createProgram(OpenCLKernelSources::ewald, replacements);
+            ewaldSumsKernel = cl::Kernel(program, "calculateEwaldCosSinSums");
+            ewaldForcesKernel = cl::Kernel(program, "calculateEwaldForces");
+            int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
+            cosSinSums = new OpenCLArray(cl, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
+        }
+    }
+    else if (force.getNonbondedMethod() == NonbondedForce::PME) {
        // Compute the PME parameters.

        int gridSizeX, gridSizeY, gridSizeZ;
@@ -1527,119 +1529,121 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
        defines["EWALD_ALPHA"] = cl.doubleToString(alpha);
        defines["TWO_OVER_SQRT_PI"] = cl.doubleToString(2.0/sqrt(M_PI));
        defines["USE_EWALD"] = "1";
-        ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
-        pmeDefines["PME_ORDER"] = cl.intToString(PmeOrder);
-        pmeDefines["NUM_ATOMS"] = cl.intToString(numParticles);
-        pmeDefines["RECIP_EXP_FACTOR"] = cl.doubleToString(M_PI*M_PI/(alpha*alpha));
-        pmeDefines["GRID_SIZE_X"] = cl.intToString(gridSizeX);
-        pmeDefines["GRID_SIZE_Y"] = cl.intToString(gridSizeY);
-        pmeDefines["GRID_SIZE_Z"] = cl.intToString(gridSizeZ);
-        pmeDefines["EPSILON_FACTOR"] = cl.doubleToString(sqrt(ONE_4PI_EPS0));
-        bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
-        if (deviceIsCpu)
-            pmeDefines["DEVICE_IS_CPU"] = "1";
-        if (cl.getPlatformData().useCpuPme) {
-            // Create the CPU PME kernel.
-            
-            try {
-                cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cl.getPlatformData().context);
-                cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha);
-                cl::Program program = cl.createProgram(OpenCLKernelSources::pme, pmeDefines);
-                cl::Kernel addForcesKernel = cl::Kernel(program, "addForces");
-                pmeio = new PmeIO(cl, addForcesKernel);
-                cl.addPreComputation(new PmePreComputation(cl, cpuPme, *pmeio));
-                cl.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
-            }
-            catch (OpenMMException& ex) {
-                // The CPU PME plugin isn't available.
-            }
-        }
-        if (pmeio == NULL) {
-            // Create required data structures.
-
-            int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
-            pmeGrid = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid");
-            cl.addAutoclearBuffer(*pmeGrid);
-            pmeGrid2 = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid2");
-            pmeBsplineModuliX = new OpenCLArray(cl, gridSizeX, elementSize, "pmeBsplineModuliX");
-            pmeBsplineModuliY = new OpenCLArray(cl, gridSizeY, elementSize, "pmeBsplineModuliY");
-            pmeBsplineModuliZ = new OpenCLArray(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ");
-            pmeBsplineTheta = new OpenCLArray(cl, PmeOrder*numParticles, 4*elementSize, "pmeBsplineTheta");
-            pmeAtomRange = OpenCLArray::create<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
-            pmeAtomGridIndex = OpenCLArray::create<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
-            sort = new OpenCLSort(cl, new SortTrait(), cl.getNumAtoms());
-            fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ);
-
-            // Initialize the b-spline moduli.
-
-            int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ);
-            vector<double> data(PmeOrder);
-            vector<double> ddata(PmeOrder);
-            vector<double> bsplines_data(maxSize);
-            data[PmeOrder-1] = 0.0;
-            data[1] = 0.0;
-            data[0] = 1.0;
-            for (int i = 3; i < PmeOrder; i++) {
-                double div = 1.0/(i-1.0);
-                data[i-1] = 0.0;
-                for (int j = 1; j < (i-1); j++)
-                    data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
-                data[0] = div*data[0];
-            }
-
-            // Differentiate.
-
-            ddata[0] = -data[0];
-            for (int i = 1; i < PmeOrder; i++)
-                ddata[i] = data[i-1]-data[i];
-            double div = 1.0/(PmeOrder-1);
-            data[PmeOrder-1] = 0.0;
-            for (int i = 1; i < (PmeOrder-1); i++)
-                data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]);
-            data[0] = div*data[0];
-            for (int i = 0; i < maxSize; i++)
-                bsplines_data[i] = 0.0;
-            for (int i = 1; i <= PmeOrder; i++)
-                bsplines_data[i] = data[i-1];
-
-            // Evaluate the actual bspline moduli for X/Y/Z.
-
-            for(int dim = 0; dim < 3; dim++) {
-                int ndata = (dim == 0 ? gridSizeX : dim == 1 ? gridSizeY : gridSizeZ);
-                vector<cl_double> moduli(ndata);
-                for (int i = 0; i < ndata; i++) {
-                    double sc = 0.0;
-                    double ss = 0.0;
-                    for (int j = 0; j < ndata; j++) {
-                        double arg = (2.0*M_PI*i*j)/ndata;
-                        sc += bsplines_data[j]*cos(arg);
-                        ss += bsplines_data[j]*sin(arg);
-                    }
-                    moduli[i] = (float) (sc*sc+ss*ss);
+        if (cl.getContextIndex() == 0) {
+            ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
+            pmeDefines["PME_ORDER"] = cl.intToString(PmeOrder);
+            pmeDefines["NUM_ATOMS"] = cl.intToString(numParticles);
+            pmeDefines["RECIP_EXP_FACTOR"] = cl.doubleToString(M_PI*M_PI/(alpha*alpha));
+            pmeDefines["GRID_SIZE_X"] = cl.intToString(gridSizeX);
+            pmeDefines["GRID_SIZE_Y"] = cl.intToString(gridSizeY);
+            pmeDefines["GRID_SIZE_Z"] = cl.intToString(gridSizeZ);
+            pmeDefines["EPSILON_FACTOR"] = cl.doubleToString(sqrt(ONE_4PI_EPS0));
+            bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
+            if (deviceIsCpu)
+                pmeDefines["DEVICE_IS_CPU"] = "1";
+            if (cl.getPlatformData().useCpuPme) {
+                // Create the CPU PME kernel.
+
+                try {
+                    cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cl.getPlatformData().context);
+                    cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha);
+                    cl::Program program = cl.createProgram(OpenCLKernelSources::pme, pmeDefines);
+                    cl::Kernel addForcesKernel = cl::Kernel(program, "addForces");
+                    pmeio = new PmeIO(cl, addForcesKernel);
+                    cl.addPreComputation(new PmePreComputation(cl, cpuPme, *pmeio));
+                    cl.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
                }
-                for (int i = 0; i < ndata; i++)
-                {
-                    if (moduli[i] < 1.0e-7)
-                        moduli[i] = (moduli[i-1]+moduli[i+1])*0.5f;
+                catch (OpenMMException& ex) {
+                    // The CPU PME plugin isn't available.
                }
-                if (cl.getUseDoublePrecision()) {
-                    if (dim == 0)
-                        pmeBsplineModuliX->upload(moduli);
-                    else if (dim == 1)
-                        pmeBsplineModuliY->upload(moduli);
-                    else
-                        pmeBsplineModuliZ->upload(moduli);
+            }
+            if (pmeio == NULL) {
+                // Create required data structures.
+
+                int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
+                pmeGrid = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid");
+                cl.addAutoclearBuffer(*pmeGrid);
+                pmeGrid2 = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid2");
+                pmeBsplineModuliX = new OpenCLArray(cl, gridSizeX, elementSize, "pmeBsplineModuliX");
+                pmeBsplineModuliY = new OpenCLArray(cl, gridSizeY, elementSize, "pmeBsplineModuliY");
+                pmeBsplineModuliZ = new OpenCLArray(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ");
+                pmeBsplineTheta = new OpenCLArray(cl, PmeOrder*numParticles, 4*elementSize, "pmeBsplineTheta");
+                pmeAtomRange = OpenCLArray::create<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
+                pmeAtomGridIndex = OpenCLArray::create<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
+                sort = new OpenCLSort(cl, new SortTrait(), cl.getNumAtoms());
+                fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ);
+
+                // Initialize the b-spline moduli.
+
+                int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ);
+                vector<double> data(PmeOrder);
+                vector<double> ddata(PmeOrder);
+                vector<double> bsplines_data(maxSize);
+                data[PmeOrder-1] = 0.0;
+                data[1] = 0.0;
+                data[0] = 1.0;
+                for (int i = 3; i < PmeOrder; i++) {
+                    double div = 1.0/(i-1.0);
+                    data[i-1] = 0.0;
+                    for (int j = 1; j < (i-1); j++)
+                        data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
+                    data[0] = div*data[0];
                }
-                else {
-                    vector<float> modulif(ndata);
+
+                // Differentiate.
+
+                ddata[0] = -data[0];
+                for (int i = 1; i < PmeOrder; i++)
+                    ddata[i] = data[i-1]-data[i];
+                double div = 1.0/(PmeOrder-1);
+                data[PmeOrder-1] = 0.0;
+                for (int i = 1; i < (PmeOrder-1); i++)
+                    data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]);
+                data[0] = div*data[0];
+                for (int i = 0; i < maxSize; i++)
+                    bsplines_data[i] = 0.0;
+                for (int i = 1; i <= PmeOrder; i++)
+                    bsplines_data[i] = data[i-1];
+
+                // Evaluate the actual bspline moduli for X/Y/Z.
+
+                for(int dim = 0; dim < 3; dim++) {
+                    int ndata = (dim == 0 ? gridSizeX : dim == 1 ? gridSizeY : gridSizeZ);
+                    vector<cl_double> moduli(ndata);
+                    for (int i = 0; i < ndata; i++) {
+                        double sc = 0.0;
+                        double ss = 0.0;
+                        for (int j = 0; j < ndata; j++) {
+                            double arg = (2.0*M_PI*i*j)/ndata;
+                            sc += bsplines_data[j]*cos(arg);
+                            ss += bsplines_data[j]*sin(arg);
+                        }
+                        moduli[i] = (float) (sc*sc+ss*ss);
+                    }
                    for (int i = 0; i < ndata; i++)
-                        modulif[i] = (float) moduli[i];
-                    if (dim == 0)
-                        pmeBsplineModuliX->upload(modulif);
-                    else if (dim == 1)
-                        pmeBsplineModuliY->upload(modulif);
-                    else
-                        pmeBsplineModuliZ->upload(modulif);
+                    {
+                        if (moduli[i] < 1.0e-7)
+                            moduli[i] = (moduli[i-1]+moduli[i+1])*0.5f;
+                    }
+                    if (cl.getUseDoublePrecision()) {
+                        if (dim == 0)
+                            pmeBsplineModuliX->upload(moduli);
+                        else if (dim == 1)
+                            pmeBsplineModuliY->upload(moduli);
+                        else
+                            pmeBsplineModuliZ->upload(moduli);
+                    }
+                    else {
+                        vector<float> modulif(ndata);
+                        for (int i = 0; i < ndata; i++)
+                            modulif[i] = (float) moduli[i];
+                        if (dim == 0)
+                            pmeBsplineModuliX->upload(modulif);
+                        else if (dim == 1)
+                            pmeBsplineModuliY->upload(modulif);
+                        else
+                            pmeBsplineModuliZ->upload(modulif);
+                    }
                }
            }
        }

--- a/platforms/opencl/src/kernels/virtualSites.cl
+++ b/platforms/opencl/src/kernels/virtualSites.cl
@@ -108,10 +108,66 @@ __kernel void computeVirtualSites(__global real4* restrict posq,
    }
 }

+#ifdef HAS_OVERLAPPING_VSITES
+    #ifdef SUPPORTS_64_BIT_ATOMICS
+        // We will use 64 bit atomics for force redistribution.
+
+        #define ADD_FORCE(index, f) addForce(index, f, longForce);
+
+        #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+
+        void addForce(int index, real4 f,  __global long* longForce) {
+            atom_add(&longForce[index], (long) (f.x*0x100000000));
+            atom_add(&longForce[index+PADDED_NUM_ATOMS], (long) (f.y*0x100000000));
+            atom_add(&longForce[index+2*PADDED_NUM_ATOMS], (long) (f.z*0x100000000));
+        }
+
+        __kernel void addDistributedForces(__global const long* restrict longForces, __global real4* restrict forces) {
+            real scale = 1/(real) 0x100000000;
+            for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
+                real4 f = (real4) (scale*longForces[index], scale*longForces[index+PADDED_NUM_ATOMS], scale*longForces[index+2*PADDED_NUM_ATOMS], 0);
+                forces[index] += f;
+            }
+        }
+    #else
+        // 64 bit atomics aren't supported, so we have to use atomic_cmpxchg() for force redistribution.
+        
+        #define ADD_FORCE(index, f) addForce(index, f, force);
+
+        void atomicAddFloat(__global float* p, float v) {
+            __global int* ip = (__global int*) p;
+            while (true) {
+                union {
+                    float f;
+                    int i;
+                } oldval, newval;
+                oldval.f = *p;
+                newval.f = oldval.f+v;
+                if (atomic_cmpxchg(ip, oldval.i, newval.i) == oldval.i)
+                    return;
+            }
+        }
+
+        void addForce(int index, float4 f, __global float4* force) {
+            __global float* components = (__global float*) force;
+            atomicAddFloat(&components[4*index], f.x);
+            atomicAddFloat(&components[4*index+1], f.y);
+            atomicAddFloat(&components[4*index+2], f.z);
+        }
+    #endif
+#else
+    // There are no overlapping virtual sites, so we can just store forces directly.
+
+    #define ADD_FORCE(index, f) force[index].xyz += (f).xyz;
+#endif
+
 /**
 * Distribute forces from virtual sites to the atoms they are based on.
 */
 __kernel void distributeForces(__global const real4* restrict posq, __global real4* restrict force,
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        __global long* restrict longForce,
+#endif
 #ifdef USE_MIXED_PRECISION
        __global real4* restrict posqCorrection,
 #endif
@@ -129,12 +185,8 @@ __kernel void distributeForces(__global const real4* restrict posq, __global rea
        int4 atoms = avg2Atoms[index];
        real2 weights = avg2Weights[index];
        real4 f = force[atoms.x];
-        real4 f1 = force[atoms.y];
-        real4 f2 = force[atoms.z];
-        f1.xyz += f.xyz*weights.x;
-        f2.xyz += f.xyz*weights.y;
-        force[atoms.y] = f1;
-        force[atoms.z] = f2;
+        ADD_FORCE(atoms.y, f*weights.x);
+        ADD_FORCE(atoms.z, f*weights.y);
    }
    
    // Three particle average sites.
@@ -143,15 +195,9 @@ __kernel void distributeForces(__global const real4* restrict posq, __global rea
        int4 atoms = avg3Atoms[index];
        real4 weights = avg3Weights[index];
        real4 f = force[atoms.x];
-        real4 f1 = force[atoms.y];
-        real4 f2 = force[atoms.z];
-        real4 f3 = force[atoms.w];
-        f1.xyz += f.xyz*weights.x;
-        f2.xyz += f.xyz*weights.y;
-        f3.xyz += f.xyz*weights.z;
-        force[atoms.y] = f1;
-        force[atoms.z] = f2;
-        force[atoms.w] = f3;
+        ADD_FORCE(atoms.y, f*weights.x);
+        ADD_FORCE(atoms.z, f*weights.y);
+        ADD_FORCE(atoms.w, f*weights.z);
    }
    
    // Out of plane sites.
@@ -165,21 +211,15 @@ __kernel void distributeForces(__global const real4* restrict posq, __global rea
        mixed4 v12 = pos2-pos1;
        mixed4 v13 = pos3-pos1;
        real4 f = force[atoms.x];
-        real4 f1 = force[atoms.y];
-        real4 f2 = force[atoms.z];
-        real4 f3 = force[atoms.w];
        real4 fp2 = (real4) (weights.x*f.x - weights.z*v13.z*f.y + weights.z*v13.y*f.z,
                   weights.z*v13.z*f.x + weights.x*f.y - weights.z*v13.x*f.z,
                  -weights.z*v13.y*f.x + weights.z*v13.x*f.y + weights.x*f.z, 0.0f);
        real4 fp3 = (real4) (weights.y*f.x + weights.z*v12.z*f.y - weights.z*v12.y*f.z,
                  -weights.z*v12.z*f.x + weights.y*f.y + weights.z*v12.x*f.z,
                   weights.z*v12.y*f.x - weights.z*v12.x*f.y + weights.y*f.z, 0.0f);
-        f1.xyz += f.xyz-fp2.xyz-fp3.xyz;
-        f2.xyz += fp2.xyz;
-        f3.xyz += fp3.xyz;
-        force[atoms.y] = f1;
-        force[atoms.z] = f2;
-        force[atoms.w] = f3;
+        ADD_FORCE(atoms.y, f-fp2-fp3);
+        ADD_FORCE(atoms.z, fp2);
+        ADD_FORCE(atoms.w, fp3);
    }
    
    // Local coordinates sites.
@@ -230,9 +270,9 @@ __kernel void distributeForces(__global const real4* restrict posq, __global rea
        mixed sz3 = t32*dz.x-t31*dz.y;
        mixed4 wxScaled = wx*invNormXdir;
        real4 f = force[atoms.x];
-        real4 f1 = force[atoms.y];
-        real4 f2 = force[atoms.z];
-        real4 f3 = force[atoms.w];
+        real4 f1 = 0;
+        real4 f2 = 0;
+        real4 f3 = 0;
        mixed4 fp1 = localPosition*f.x;
        mixed4 fp2 = localPosition*f.y;
        mixed4 fp3 = localPosition*f.z;
@@ -263,8 +303,8 @@ __kernel void distributeForces(__global const real4* restrict posq, __global rea
        f3.x += fp3.x*wxScaled.z*( -dx.z*dx.x) + fp3.z*(dz.z*sx3+t32) + fp3.y*((-dx.x*dy.z-dz.y)*wxScaled.z + dy.z*sx3 + dx.x*t33);
        f3.y += fp3.x*wxScaled.z*( -dx.z*dx.y) + fp3.z*(dz.z*sy3-t31) + fp3.y*((-dx.y*dy.z+dz.x)*wxScaled.z + dy.z*sy3 + dx.y*t33);
        f3.z += fp3.x*wxScaled.z*(1-dx.z*dx.z) + fp3.z*(dz.z*sz3    ) + fp3.y*((-dx.z*dy.z     )*wxScaled.z + dy.z*sz3 - dx.x*t31 - dx.y*t32) + f.z*originWeights.z;
-        force[atoms.y] = f1;
-        force[atoms.z] = f2;
-        force[atoms.w] = f3;
+        ADD_FORCE(atoms.y, f1);
+        ADD_FORCE(atoms.z, f2);
+        ADD_FORCE(atoms.w, f3);
    }
 }
--- a/platforms/opencl/tests/TestOpenCLLocalEnergyMinimizer.cpp
+++ b/platforms/opencl/tests/TestOpenCLLocalEnergyMinimizer.cpp
@@ -7,7 +7,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010 Stanford University and the Authors.           *
+ * Portions copyright (c) 2010-2014 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -170,6 +170,7 @@ void testVirtualSites() {
    VerletIntegrator integrator(0.01);
    Context context(system, integrator, platform);
    context.setPositions(positions);
+    context.applyConstraints(1e-5);
    State initialState = context.getState(State::Forces | State::Energy);
    LocalEnergyMinimizer::minimize(context, tolerance);
    State finalState = context.getState(State::Forces | State::Energy | State::Positions);

--- a/platforms/opencl/tests/TestOpenCLNonbondedForce.cpp
+++ b/platforms/opencl/tests/TestOpenCLNonbondedForce.cpp
@@ -751,7 +751,7 @@ void testChangingParameters() {
    ASSERT_EQUAL_TOL(clState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
 }

-void testParallelComputation(bool useCutoff) {
+void testParallelComputation(NonbondedForce::NonbondedMethod method) {
    System system;
    const int numParticles = 200;
    for (int i = 0; i < numParticles; i++)
@@ -759,9 +759,9 @@ void testParallelComputation(bool useCutoff) {
    NonbondedForce* force = new NonbondedForce();
    for (int i = 0; i < numParticles; i++)
        force->addParticle(i%2-0.5, 0.5, 1.0);
-    if (useCutoff)
-        force->setNonbondedMethod(NonbondedForce::CutoffNonPeriodic);
+    force->setNonbondedMethod(method);
    system.addForce(force);
+    system.setDefaultPeriodicBoxVectors(Vec3(5,0,0), Vec3(0,5,0), Vec3(0,0,5));
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);
    vector<Vec3> positions(numParticles);
@@ -880,8 +880,9 @@ int main(int argc, char* argv[]) {
 //        testBlockInteractions(true);
        testDispersionCorrection();
        testChangingParameters();
-        testParallelComputation(false);
-        testParallelComputation(true);
+        testParallelComputation(NonbondedForce::NoCutoff);
+        testParallelComputation(NonbondedForce::Ewald);
+        testParallelComputation(NonbondedForce::PME);
        testSwitchingFunction(NonbondedForce::CutoffNonPeriodic);
        testSwitchingFunction(NonbondedForce::PME);
    }

--- a/platforms/opencl/tests/TestOpenCLVirtualSites.cpp
+++ b/platforms/opencl/tests/TestOpenCLVirtualSites.cpp
@@ -524,6 +524,47 @@ void testReordering() {
    }
 }

+/**
+ * Test a System where multiple virtual sites are all calculated from the same particles.
+ */
+void testOverlappingSites() {
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    NonbondedForce* nonbonded = new NonbondedForce();
+    system.addForce(nonbonded);
+    nonbonded->addParticle(1.0, 0.0, 0.0);
+    nonbonded->addParticle(-0.5, 0.0, 0.0);
+    nonbonded->addParticle(-0.5, 0.0, 0.0);
+    vector<Vec3> positions;
+    positions.push_back(Vec3(0, 0, 0));
+    positions.push_back(Vec3(10, 0, 0));
+    positions.push_back(Vec3(0, 10, 0));
+    for (int i = 0; i < 20; i++) {
+        system.addParticle(0.0);
+        double u = 0.1*((i+1)%4);
+        double v = 0.05*i;
+        system.setVirtualSite(3+i, new ThreeParticleAverageSite(0, 1, 2, u, v, 1-u-v));
+        nonbonded->addParticle(i%2 == 0 ? -1.0 : 1.0, 0.0, 0.0);
+        positions.push_back(Vec3());
+    }
+    VerletIntegrator i1(0.002);
+    VerletIntegrator i2(0.002);
+    Context c1(system, i1, Platform::getPlatformByName("Reference"));
+    Context c2(system, i2, platform);
+    c1.setPositions(positions);
+    c2.setPositions(positions);
+    c1.applyConstraints(0.0001);
+    c2.applyConstraints(0.0001);
+    State s1 = c1.getState(State::Positions | State::Forces);
+    State s2 = c2.getState(State::Positions | State::Forces);
+    for (int i = 0; i < system.getNumParticles(); i++)
+        ASSERT_EQUAL_VEC(s1.getPositions()[i], s2.getPositions()[i], 1e-5);
+    for (int i = 0; i < 3; i++)
+        ASSERT_EQUAL_VEC(s1.getForces()[i], s2.getForces()[i], 1e-5);
+}
+
 int main(int argc, char* argv[]) {
    try {
        if (argc > 1)
@@ -535,6 +576,7 @@ int main(int argc, char* argv[]) {
        testLocalCoordinates();
        testConservationLaws();
        testReordering();
+        testOverlappingSites();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/reference/tests/TestReferenceLocalEnergyMinimizer.cpp
+++ b/platforms/reference/tests/TestReferenceLocalEnergyMinimizer.cpp
@@ -7,7 +7,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010 Stanford University and the Authors.           *
+ * Portions copyright (c) 2010-2014 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -171,6 +171,7 @@ void testVirtualSites() {
    VerletIntegrator integrator(0.01);
    Context context(system, integrator, platform);
    context.setPositions(positions);
+    context.applyConstraints(1e-5);
    State initialState = context.getState(State::Forces | State::Energy);
    LocalEnergyMinimizer::minimize(context, tolerance);
    State finalState = context.getState(State::Forces | State::Energy | State::Positions);

--- a/plugins/amoeba/openmmapi/include/openmm/AmoebaTorsionTorsionForce.h
+++ b/plugins/amoeba/openmmapi/include/openmm/AmoebaTorsionTorsionForce.h
@@ -126,13 +126,15 @@ public:
     * Set the torsion-torsion grid at the specified index
     *
     * @param index         the index of the torsion-torsion for which to get parameters
-     * @param grid          grid
+     * @param grid          either 3 or 6 values may be specified per grid point.  If the derivatives
+     *                      are omitted, they are calculated automatically by fitting a 2D spline to
+     *                      the energies.
     *                         grid[x][y][0] = x value
     *                         grid[x][y][1] = y value
-     *                         grid[x][y][2] = function value
-     *                         grid[x][y][3] = dfdx value
-     *                         grid[x][y][4] = dfdy value
-     *                         grid[x][y][5] = dfd(xy) value
+     *                         grid[x][y][2] = energy
+     *                         grid[x][y][3] = dEdx value
+     *                         grid[x][y][4] = dEdy value
+     *                         grid[x][y][5] = dEd(xy) value
     */
    void setTorsionTorsionGrid(int index, const std::vector<std::vector<std::vector<double> > >& grid);

@@ -181,29 +183,7 @@ public:
        _spacing[0]     = _spacing[1]     = 1.0;
    }

-    TorsionTorsionGridInfo(const TorsionTorsionGrid& grid) {
-
-        _grid.resize(grid.size());
-        for(unsigned int kk = 0; kk < grid.size(); kk++){
-            _grid[kk].resize(grid[kk].size());
-            for(unsigned int jj = 0; jj < grid[kk].size(); jj++){
-                _grid[kk][jj].resize(grid[kk][jj].size());
-                for(unsigned int ii = 0; ii < grid[kk][jj].size(); ii++){
-                    _grid[kk][jj][ii] = grid[kk][jj][ii];
-                }
-            }
-        }
-
-        _startValues[0] =  _grid[0][0][0];
-        _startValues[1] =  _grid[0][0][1];
-
-        _spacing[0]     = static_cast<double>(_grid.size()-1)/360.0;
-        _spacing[1]     = static_cast<double>(grid.size()-1)/360.0;
-
-        _size[0]        = static_cast<int>(grid.size());
-        _size[1]        = static_cast<int>(grid[0].size());
-
-    }
+    TorsionTorsionGridInfo(const TorsionTorsionGrid& grid);

    const TorsionTorsionGrid& getTorsionTorsionGrid(void) const {
        return _grid;

--- a/plugins/amoeba/openmmapi/src/AmoebaTorsionTorsionForce.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaTorsionTorsionForce.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2014 Stanford University and the Authors.      *
 * Authors:                                                                   *
 * Contributors:                                                              *
 *                                                                            *
@@ -33,8 +33,10 @@
 #include "openmm/OpenMMException.h"
 #include "openmm/AmoebaTorsionTorsionForce.h"
 #include "openmm/internal/AmoebaTorsionTorsionForceImpl.h"
+#include "openmm/internal/SplineFitter.h"

 using namespace OpenMM;
+using namespace std;

 AmoebaTorsionTorsionForce::AmoebaTorsionTorsionForce() {
 }
@@ -82,3 +84,102 @@ void AmoebaTorsionTorsionForce::setTorsionTorsionGrid(int index, const TorsionTo
 ForceImpl* AmoebaTorsionTorsionForce::createImpl() const {
    return new AmoebaTorsionTorsionForceImpl(*this);
 }
+
+AmoebaTorsionTorsionForce::TorsionTorsionGridInfo::TorsionTorsionGridInfo(const TorsionTorsionGrid& grid) {
+    if (grid[0][0][0] != grid[1][0][0])
+        _grid = grid;
+    else {
+        // We need to transpose the grid.
+        
+        int xsize = grid[0].size();
+        int ysize = grid.size();
+        _grid.resize(xsize);
+        for (int i = 0; i < xsize; i++) {
+            _grid[i].resize(ysize);
+            for (int j = 0; j < ysize; j++)
+                _grid[i][j] = grid[j][i];
+        }
+    }
+
+    _startValues[0] =  _grid[0][0][0];
+    _startValues[1] =  _grid[0][0][1];
+
+    _spacing[0]     = static_cast<double>(_grid.size()-1)/360.0;
+    _spacing[1]     = static_cast<double>(_grid.size()-1)/360.0;
+
+    _size[0]        = static_cast<int>(_grid.size());
+    _size[1]        = static_cast<int>(_grid[0].size());
+    
+    if (_grid[0][0].size() == 3) {
+        // We need to compute the derivatives ourselves.  First determine if the grid is periodic.
+        
+        int xsize = _size[0];
+        int ysize = _size[1];
+        bool periodic = true;
+        for (int i = 0; i < xsize; i++)
+            if (_grid[i][0][2] != _grid[i][ysize-1][2])
+                periodic = false;
+        for (int i = 0; i < ysize; i++)
+            if (_grid[0][i][2] != _grid[xsize-1][i][2])
+                periodic = false;
+        
+        // Compute derivatives with respect to the first angle.
+
+        vector<double> x(xsize), y(ysize);
+        for (int i = 0; i < xsize; i++)
+            x[i] = _grid[i][0][0];
+        for (int i = 0; i < ysize; i++)
+            y[i] = _grid[0][i][1];
+        vector<double> d1(xsize*ysize), d2(xsize*ysize), d12(xsize*ysize);
+        vector<double> t(xsize), deriv(xsize);
+        for (int i = 0; i < ysize; i++) {
+            for (int j = 0; j < xsize; j++)
+                t[j] = _grid[j][i][2];
+            if (periodic)
+                SplineFitter::createPeriodicSpline(x, t, deriv);
+            else
+                SplineFitter::createNaturalSpline(x, t, deriv);
+            for (int j = 0; j < xsize; j++)
+                d1[j+xsize*i] = SplineFitter::evaluateSplineDerivative(x, t, deriv, x[j]);
+        }
+
+        // Compute derivatives with respect to the second angle.
+
+        t.resize(ysize);
+        deriv.resize(ysize);
+        for (int i = 0; i < xsize; i++) {
+            for (int j = 0; j < ysize; j++)
+                t[j] = _grid[i][j][2];
+            if (periodic)
+                SplineFitter::createPeriodicSpline(y, t, deriv);
+            else
+                SplineFitter::createNaturalSpline(y, t, deriv);
+            for (int j = 0; j < ysize; j++)
+                d2[i+xsize*j] = SplineFitter::evaluateSplineDerivative(y, t, deriv, y[j]);
+        }
+
+        // Compute cross derivatives.
+
+        t.resize(xsize);
+        deriv.resize(xsize);
+        for (int i = 0; i < ysize; i++) {
+            for (int j = 0; j < xsize; j++)
+                t[j] = d2[j+xsize*i];
+            if (periodic)
+                SplineFitter::createPeriodicSpline(x, t, deriv);
+            else
+                SplineFitter::createNaturalSpline(x, t, deriv);
+            for (int j = 0; j < xsize; j++)
+                d12[j+xsize*i] = SplineFitter::evaluateSplineDerivative(x, t, deriv, x[j]);
+        }
+        
+        // Add the derivatives to the grid.
+        
+        for (int i = 0; i < xsize; i++)
+            for (int j = 0; j < ysize; j++) {
+                _grid[i][j].push_back(d1[i+xsize*j]);
+                _grid[i][j].push_back(d2[i+xsize*j]);
+                _grid[i][j].push_back(d12[i+xsize*j]);
+            }
+    }
+}
--- a/plugins/amoeba/platforms/reference/tests/TestReferenceAmoebaTorsionTorsionForce.cpp
+++ b/plugins/amoeba/platforms/reference/tests/TestReferenceAmoebaTorsionTorsionForce.cpp
@@ -49,7 +49,7 @@ extern "C" OPENMM_EXPORT void registerAmoebaReferenceKernelFactories();

 const double TOL = 1e-4;

-TorsionTorsionGrid& getTorsionGrid( int gridIndex ) {
+TorsionTorsionGrid getTorsionGrid(int gridIndex, bool includeDerivs) {

 static double grid[4][625][6] = { 
    {
@@ -2557,35 +2557,32 @@ static double grid[4][625][6] = {
 {     165.0000,     180.0000, -0.182999000E+01,  0.377952854E-01,  0.233583295E-01, -0.109828932E-02 }, 
 {     180.0000,     180.0000, -0.146854000E+01,  0.491175487E-02,  0.195601580E-02, -0.163177030E-02 }  } };

-    // static std::vector< std::vector< std::vector<double> > > TorsionTorsionGrid
-    static std::vector<TorsionTorsionGrid> grids;
-    if( grids.size() == 0 ){
-        grids.resize(4);
-        for( int ii = 0; ii < 4; ii++ ){
-            grids[ii].resize( 25 );
-            for( int jj = 0; jj < 25; jj++ ){
-                grids[ii][jj].resize(25);
-                for( int kk = 0; kk < 25; kk++ ){
-                    grids[ii][jj][kk].resize(6);
-                }
+    int elementCount = (includeDerivs ? 6 : 3);
+    std::vector<TorsionTorsionGrid> grids(4);
+    for( int ii = 0; ii < 4; ii++ ){
+        grids[ii].resize( 25 );
+        for( int jj = 0; jj < 25; jj++ ){
+            grids[ii][jj].resize(25);
+            for( int kk = 0; kk < 25; kk++ ){
+                grids[ii][jj][kk].resize(elementCount);
            }
-            int index = 0;
-            for( int jj = 0; jj < 25; jj++ ){
-                for( int kk = 0; kk < 25; kk++ ){
-                    int jjIndex = static_cast<int>(((grid[ii][index][0] + 180.0)/15.0)+1.0e-05);
-                    int kkIndex = static_cast<int>(((grid[ii][index][1] + 180.0)/15.0)+1.0e-05);
-                    for( int ll = 0; ll < 6; ll++ ){
-                        grids[ii][kk][jj][ll] = grid[ii][index][ll];
-                    }
-                    index++;
+        }
+        int index = 0;
+        for( int jj = 0; jj < 25; jj++ ){
+            for( int kk = 0; kk < 25; kk++ ){
+                int jjIndex = static_cast<int>(((grid[ii][index][0] + 180.0)/15.0)+1.0e-05);
+                int kkIndex = static_cast<int>(((grid[ii][index][1] + 180.0)/15.0)+1.0e-05);
+                for( int ll = 0; ll < elementCount; ll++ ){
+                    grids[ii][kk][jj][ll] = grid[ii][index][ll];
                }
+                index++;
            }
        }
    }
    return grids[gridIndex];
 }

-void testTorsionTorsion( FILE* log, int systemId ) {
+void testTorsionTorsion(int systemId, bool includeDerivs) {

    System system;
    int numberOfParticles = 6;
@@ -2645,11 +2642,11 @@ void testTorsionTorsion( FILE* log, int systemId ) {

        expectedEnergy        = -3.372536909E+00;
    }
-    amoebaTorsionTorsionForce->addTorsionTorsion( 0, 1, 2, 3, 4, chiralCheckAtomIndex, 0 );
-    amoebaTorsionTorsionForce->setTorsionTorsionGrid( 0, getTorsionGrid( gridIndex ) );
+    amoebaTorsionTorsionForce->addTorsionTorsion(0, 1, 2, 3, 4, chiralCheckAtomIndex, 0);
+    amoebaTorsionTorsionForce->setTorsionTorsionGrid(0, getTorsionGrid(gridIndex, includeDerivs));
    
    system.addForce(amoebaTorsionTorsionForce);
-    Context context(system, integrator, Platform::getPlatformByName( "Reference"));
+    Context context(system, integrator, Platform::getPlatformByName("Reference"));

    context.setPositions(positions);
    State state                      = context.getState(State::Forces | State::Energy);
@@ -2662,18 +2659,6 @@ void testTorsionTorsion( FILE* log, int systemId ) {
        forces[ii][2] *= conversion;
    }

-#ifdef AMOEBA_DEBUG
-    if( log ){
-        (void) fprintf( log, "computeAmoebaTorsionTorsionForces: expected energy=%14.7e %14.7e\n", expectedEnergy, state.getPotentialEnergy() );
-        for( unsigned int ii = 0; ii < forces.size(); ii++ ){
-            (void) fprintf( log, "%6u [%14.7e %14.7e %14.7e]   [%14.7e %14.7e %14.7e]\n", ii,
-                            expectedForces[ii][0], expectedForces[ii][1], expectedForces[ii][2],
-                            forces[ii][0], forces[ii][1], forces[ii][2] );
-        }
-        (void) fflush( log );
-    }
-#endif
-
    double tolerance = 1.0e-03;
    for( unsigned int ii = 0; ii < forces.size(); ii++ ){
        ASSERT_EQUAL_VEC( expectedForces[ii], forces[ii], tolerance );
@@ -2687,10 +2672,8 @@ int main( int numberOfArguments, char* argv[] ) {
    try {
        std::cout << "TestReferenceAmoebaTorsionTorsionForce running test..." << std::endl;
        registerAmoebaReferenceKernelFactories();
-        //registerAmoebaCudaKernelFactories();
-
-        FILE* log = NULL;
-        testTorsionTorsion( log, 1 );
+        testTorsionTorsion(1, true);
+        testTorsionTorsion(1, false);
    } catch(const std::exception& e) {
        std::cout << "exception: " << e.what() << std::endl;
        std::cout << "FAIL - ERROR.  Test failed." << std::endl;

--- a/plugins/drude/openmmapi/src/DrudeLangevinIntegrator.cpp
+++ b/plugins/drude/openmmapi/src/DrudeLangevinIntegrator.cpp
@@ -33,8 +33,8 @@
 #include "openmm/Context.h"
 #include "openmm/OpenMMException.h"
 #include "openmm/internal/ContextImpl.h"
+#include "openmm/internal/OSRngSeed.h"
 #include "openmm/DrudeKernels.h"
-#include <cmath>
 #include <ctime>
 #include <string>

@@ -49,7 +49,7 @@ DrudeLangevinIntegrator::DrudeLangevinIntegrator(double temperature, double fric
    setDrudeFriction(drudeFrictionCoeff);
    setStepSize(stepSize);
    setConstraintTolerance(1e-5);
-    setRandomNumberSeed((int) time(NULL));
+    setRandomNumberSeed(osrngseed());
 }

 void DrudeLangevinIntegrator::initialize(ContextImpl& contextRef) {

--- a/plugins/rpmd/openmmapi/src/RPMDIntegrator.cpp
+++ b/plugins/rpmd/openmmapi/src/RPMDIntegrator.cpp
@@ -33,10 +33,10 @@
 #include "openmm/Context.h"
 #include "openmm/OpenMMException.h"
 #include "openmm/internal/ContextImpl.h"
+#include "openmm/internal/OSRngSeed.h"
 #include "openmm/RpmdKernels.h"
 #include "SimTKOpenMMRealType.h"
 #include <cmath>
-#include <ctime>
 #include <string>

 using namespace OpenMM;
@@ -48,7 +48,7 @@ RPMDIntegrator::RPMDIntegrator(int numCopies, double temperature, double frictio
    setFriction(frictionCoeff);
    setStepSize(stepSize);
    setConstraintTolerance(1e-5);
-    setRandomNumberSeed((int) time(NULL));
+    setRandomNumberSeed(osrngseed());
 }

 RPMDIntegrator::RPMDIntegrator(int numCopies, double temperature, double frictionCoeff, double stepSize) :
@@ -57,7 +57,7 @@ RPMDIntegrator::RPMDIntegrator(int numCopies, double temperature, double frictio
    setFriction(frictionCoeff);
    setStepSize(stepSize);
    setConstraintTolerance(1e-5);
-    setRandomNumberSeed((int) time(NULL));
+    setRandomNumberSeed(osrngseed());
 }

 void RPMDIntegrator::initialize(ContextImpl& contextRef) {

--- a/serialization/include/openmm/serialization/XmlSerializer.h
+++ b/serialization/include/openmm/serialization/XmlSerializer.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010 Stanford University and the Authors.           *
+ * Portions copyright (c) 2010-2014 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -79,7 +79,7 @@ public:
 private:
    static void serialize(const SerializationNode& node, std::ostream& stream);
    static void* deserializeStream(std::istream& stream);
-    static TiXmlElement* encodeNode(const SerializationNode& node);
+    static void encodeNode(const SerializationNode& node, std::ostream& stream, int depth);
    static void decodeNode(SerializationNode& node, const TiXmlElement& element);
 };


--- a/serialization/src/XmlSerializer.cpp
+++ b/serialization/src/XmlSerializer.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010 Stanford University and the Authors.           *
+ * Portions copyright (c) 2010-2014 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -36,25 +36,32 @@ using namespace OpenMM;
 using namespace std;

 void XmlSerializer::serialize(const SerializationNode& node, std::ostream& stream) {
-    TiXmlDocument doc;
-    TiXmlDeclaration* decl = new TiXmlDeclaration( "1.0", "", "" );
-    doc.LinkEndChild(decl);
-    doc.LinkEndChild(encodeNode(node));
-    TiXmlPrinter printer;
-    printer.SetIndent("\t");
-    doc.Accept(&printer);
-    stream << printer.Str();
+    stream << "<?xml version=\"1.0\" ?>\n";
+    encodeNode(node, stream, 0);
 }

-TiXmlElement* XmlSerializer::encodeNode(const SerializationNode& node) {
-    TiXmlElement* element = new TiXmlElement(node.getName());
+void XmlSerializer::encodeNode(const SerializationNode& node, std::ostream& stream, int depth) {
+    for (int i = 0; i < depth; i++)
+        stream << '\t';
+    stream << '<' << node.getName();
    const map<string, string>& properties = node.getProperties();
-    for (map<string, string>::const_iterator iter = properties.begin(); iter != properties.end(); ++iter)
-        element->SetAttribute(iter->first.c_str(), iter->second.c_str());
+    for (map<string, string>::const_iterator iter = properties.begin(); iter != properties.end(); ++iter) {
+        string name, value;
+        TiXmlBase::EncodeString(iter->first, &name);
+        TiXmlBase::EncodeString(iter->second, &value);
+        stream << ' ' << name << "=\"" << value << '\"';
+    }
    const vector<SerializationNode>& children = node.getChildren();
-    for (int i = 0; i < (int) children.size(); i++)
-        element->LinkEndChild(encodeNode(children[i]));
-    return element;
+    if (children.size() == 0)
+        stream << "/>\n";
+    else {
+        stream << ">\n";
+        for (int i = 0; i < (int) children.size(); i++)
+            encodeNode(children[i], stream, depth+1);
+        for (int i = 0; i < depth; i++)
+            stream << '\t';
+        stream << "</" << node.getName() << ">\n";
+    }
 }

 void* XmlSerializer::deserializeStream(std::istream& stream) {

--- a/tests/TestFindExclusions.cpp
+++ b/tests/TestFindExclusions.cpp
@@ -30,6 +30,7 @@
 * -------------------------------------------------------------------------- */

 #include "openmm/internal/AssertionUtilities.h"
+#include "openmm/CustomNonbondedForce.h"
 #include "openmm/NonbondedForce.h"
 #include "openmm/OpenMMException.h"
 #include <iostream>
@@ -166,10 +167,62 @@ void testReplaceExceptions() {
    ASSERT(charge == 5.0);
 }

+/**
+ * This is the same as testFindExceptions(), except it tests adding exclusions to a CustomNonbondedForce.
+ */
+
+void testFindCustomExclusions() {
+    CustomNonbondedForce nonbonded("r");
+    vector<pair<int, int> > bonds;
+    vector<double> params;
+    for (int i = 0; i < NUM_ATOMS; i++)
+        nonbonded.addParticle(params);
+    // loop over all main-chain atoms (even numbered atoms)
+    for (int i = 0; i < NUM_ATOMS-1; i += 2)
+    {
+        // side-chain bonds
+        bonds.push_back(pair<int, int>(i, i+1));
+        // main-chain bonds
+        if (i < NUM_ATOMS-2) // penultimate atom (NUM_ATOMS-2) has no subsequent main-chain atom
+            bonds.push_back(pair<int, int>(i, i+2));
+    }
+    nonbonded.createExclusionsFromBonds(bonds, 3);
+
+    // Build lists of the expected exclusions.
+
+    vector<set<int> > expectedExclusions(NUM_ATOMS);
+    int totalExclusions = 0;
+    for (int i = 0; i < NUM_ATOMS; i += 2) {
+        addAtomsToExclusions(i, i+1, expectedExclusions, totalExclusions);
+        addAtomsToExclusions(i, i+2, expectedExclusions, totalExclusions);
+        addAtomsToExclusions(i, i+3, expectedExclusions, totalExclusions);
+        addAtomsToExclusions(i, i+4, expectedExclusions, totalExclusions);
+        addAtomsToExclusions(i+1, i+2, expectedExclusions, totalExclusions);
+        addAtomsToExclusions(i, i+5, expectedExclusions, totalExclusions);
+        addAtomsToExclusions(i, i+6, expectedExclusions, totalExclusions);
+        addAtomsToExclusions(i+1, i+3, expectedExclusions, totalExclusions);
+        addAtomsToExclusions(i+1, i+4, expectedExclusions, totalExclusions);
+    }
+    for (int i = 0; i < nonbonded.getNumExclusions(); i++) {
+        int particle1, particle2;
+        nonbonded.getExclusionParticles(i, particle1, particle2);
+    }
+
+    // Compare them to the exceptions that were generated.
+
+    ASSERT_EQUAL(totalExclusions, nonbonded.getNumExclusions());
+    for (int i = 0; i < nonbonded.getNumExclusions(); i++) {
+        int particle1, particle2;
+        nonbonded.getExclusionParticles(i, particle1, particle2);
+        ASSERT(expectedExclusions[particle1].find(particle2) != expectedExclusions[particle1].end());
+    }
+}
+
 int main() {
    try {
        testFindExceptions();
        testReplaceExceptions();
+        testFindCustomExclusions();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/wrappers/python/CMakeLists.txt
+++ b/wrappers/python/CMakeLists.txt
@@ -55,8 +55,12 @@ foreach(SUBDIR ${SUBDIRS})
        "${CMAKE_CURRENT_SOURCE_DIR}/${SUBDIR}/*.xml"
        "${CMAKE_CURRENT_SOURCE_DIR}/${SUBDIR}/*.pdb"
        "${CMAKE_CURRENT_SOURCE_DIR}/${SUBDIR}/*.prmtop"
+        "${CMAKE_CURRENT_SOURCE_DIR}/${SUBDIR}/*.parm7"
+        "${CMAKE_CURRENT_SOURCE_DIR}/${SUBDIR}/*.rst7"
+        "${CMAKE_CURRENT_SOURCE_DIR}/${SUBDIR}/*.ncrst"
        "${CMAKE_CURRENT_SOURCE_DIR}/${SUBDIR}/*.dms"
        "${CMAKE_CURRENT_SOURCE_DIR}/${SUBDIR}/*.top"
+        "${CMAKE_CURRENT_SOURCE_DIR}/${SUBDIR}/*.par"
        "${CMAKE_CURRENT_SOURCE_DIR}/${SUBDIR}/*psf"
        "${CMAKE_CURRENT_SOURCE_DIR}/${SUBDIR}/charmm22.*"
    )

--- a/wrappers/python/simtk/openmm/app/amberinpcrdfile.py
+++ b/wrappers/python/simtk/openmm/app/amberinpcrdfile.py
@@ -6,9 +6,9 @@ Simbios, the NIH National Center for Physics-Based Simulation of
 Biological Structures at Stanford, funded under the NIH Roadmap for
 Medical Research, grant U54 GM072970. See https://simtk.org.

-Portions copyright (c) 2012 Stanford University and the Authors.
+Portions copyright (c) 2012-2014 Stanford University and the Authors.
 Authors: Peter Eastman
-Contributors:
+Contributors: Jason Swails

 Permission is hereby granted, free of charge, to any person obtaining a
 copy of this software and associated documentation files (the "Software"),
@@ -31,87 +31,97 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 __author__ = "Peter Eastman"
 __version__ = "1.0"

+from functools import wraps
 from simtk.openmm.app.internal import amber_file_parser
 from simtk.unit import Quantity, nanometers, picoseconds
+import warnings
 try:
-    import numpy
+    import numpy as np
 except:
-    pass
+    np = None
+
+def numpy_protector(func):
+    """
+    Decorator to emit useful error messages if users try to request numpy
+    processing if numpy is not available. Raises ImportError if numpy could not
+    be found
+    """
+    @wraps(func)
+    def wrapper(self, asNumpy=False):
+        if asNumpy and np is None:
+            raise ImportError('Could not import numpy. Cannot set asNumpy=True')
+        return func(self, asNumpy=asNumpy)
+    return wrapper

 class AmberInpcrdFile(object):
    """AmberInpcrdFile parses an AMBER inpcrd file and loads the data stored in it."""

-    def __init__(self, file, loadVelocities=False, loadBoxVectors=False):
+    def __init__(self, file, loadVelocities=None, loadBoxVectors=None):
        """Load an inpcrd file.

        An inpcrd file contains atom positions and, optionally, velocities and periodic box dimensions.
-        Unfortunately, it is sometimes impossible to determine from the file itself exactly what data
-        it contains.  You therefore must specify in advance what data to load.  It is stored into this
-        object's "positions", "velocities", and "boxVectors" fields.

        Parameters:
         - file (string) the name of the file to load
-         - loadVelocities (boolean=False) whether to load velocities from the file
-         - loadBoxVectors (boolean=False) whether to load the periodic box vectors
+         - loadVelocities (boolean=None) deprecated. Velocities are loaded automatically if present
+         - loadBoxVectors (boolean=None) deprecated. Box vectors are loaded automatically if present
        """
-        results = amber_file_parser.readAmberCoordinates(file, read_velocities=loadVelocities, read_box=loadBoxVectors)
-        if loadVelocities:
-            ## The atom positions read from the inpcrd file
-            self.positions = results[0]
-            if loadBoxVectors:
-                ## The periodic box vectors read from the inpcrd file
-                self.boxVectors = results[1]
-                ## The atom velocities read from the inpcrd file
-                self.velocities = results[2]
-            else:
-                self.velocities = results[1]
-        elif loadBoxVectors:
-            self.positions = results[0]
-            self.boxVectors = results[1]
-        else:
-            self.positions = results
+        self.file = file
+        if loadVelocities is not None or loadBoxVectors is not None:
+            warnings.warn('loadVelocities and loadBoxVectors have been '
+                          'deprecated. velocities and box information '
+                          'is loaded automatically if the inpcrd file contains '
+                          'them.', DeprecationWarning)
+        results = amber_file_parser.readAmberCoordinates(file)
+        self.positions, self.velocities, self.boxVectors = results
+        # Cached numpy arrays
        self._numpyPositions = None
-        if loadVelocities:
-            self._numpyVelocities = None
-        if loadBoxVectors:
-            self._numpyBoxVectors = None
+        self._numpyVelocities = None
+        self._numpyBoxVectors = None

+    @numpy_protector
    def getPositions(self, asNumpy=False):
        """Get the atomic positions.

        Parameters:
         - asNumpy (boolean=False) if true, the values are returned as a numpy array instead of a list of Vec3s
-         """
+        """
        if asNumpy:
            if self._numpyPositions is None:
-                self._numpyPositions = Quantity(numpy.array(self.positions.value_in_unit(nanometers)), nanometers)
+                self._numpyPositions = Quantity(np.array(self.positions.value_in_unit(nanometers)), nanometers)
            return self._numpyPositions
        return self.positions

+    @numpy_protector
    def getVelocities(self, asNumpy=False):
        """Get the atomic velocities.

        Parameters:
         - asNumpy (boolean=False) if true, the vectors are returned as numpy arrays instead of Vec3s
-         """
+        """
+        if self.velocities is None:
+            raise AttributeError('velocities not found in %s' % self.file)
        if asNumpy:
            if self._numpyVelocities is None:
-                self._numpyVelocities = Quantity(numpy.array(self.velocities.value_in_unit(nanometers/picoseconds)), nanometers/picoseconds)
+                self._numpyVelocities = Quantity(np.array(self.velocities.value_in_unit(nanometers/picoseconds)), nanometers/picoseconds)
            return self._numpyVelocities
        return self.velocities

+    @numpy_protector
    def getBoxVectors(self, asNumpy=False):
        """Get the periodic box vectors.

        Parameters:
         - asNumpy (boolean=False) if true, the values are returned as a numpy array instead of a list of Vec3s
-         """
+        """
+        if self.boxVectors is None:
+            raise AttributeError('Box information not found in %s' % self.file)
        if asNumpy:
            if self._numpyBoxVectors is None:
                self._numpyBoxVectors = []
-                self._numpyBoxVectors.append(Quantity(numpy.array(self.boxVectors[0].value_in_unit(nanometers)), nanometers))
-                self._numpyBoxVectors.append(Quantity(numpy.array(self.boxVectors[1].value_in_unit(nanometers)), nanometers))
-                self._numpyBoxVectors.append(Quantity(numpy.array(self.boxVectors[2].value_in_unit(nanometers)), nanometers))
+                self._numpyBoxVectors.append(Quantity(np.array(self.boxVectors[0].value_in_unit(nanometers)), nanometers))
+                self._numpyBoxVectors.append(Quantity(np.array(self.boxVectors[1].value_in_unit(nanometers)), nanometers))
+                self._numpyBoxVectors.append(Quantity(np.array(self.boxVectors[2].value_in_unit(nanometers)), nanometers))
            return self._numpyBoxVectors
        return self.boxVectors