Merge branch 'master' of https://github.com/SimTk/openmm

6160b726 · Lee-Ping Wang · 8df54762 · d59a34fb · 6160b726 · 6160b726
Commit 6160b726 authored Nov 19, 2013 by Lee-Ping Wang
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -105,15 +105,16 @@ ELSE( CMAKE_SIZEOF_VOID_P EQUAL 8 )
  SET( LIB64  )
 ENDIF( CMAKE_SIZEOF_VOID_P EQUAL 8 )
-# Build universal binaries compatible with OS X 10.5
+# Build universal binaries compatible with OS X 10.7
 IF (APPLE AND NOT CMAKE_OSX_DEPLOYMENT_TARGET)
    SET (CMAKE_OSX_ARCHITECTURES "i386;x86_64" CACHE STRING "The processor architectures to build for" FORCE)
-    SET (CMAKE_OSX_DEPLOYMENT_TARGET "10.5" CACHE STRING "The minimum version of OS X to support" FORCE)
+    SET (CMAKE_OSX_DEPLOYMENT_TARGET "10.7" CACHE STRING "The minimum version of OS X to support" FORCE)
 ENDIF (APPLE AND NOT CMAKE_OSX_DEPLOYMENT_TARGET)
 # Improve the linking behavior of Mac libraries
 IF (APPLE)
    SET (CMAKE_INSTALL_NAME_DIR "@rpath")
+    SET_PROPERTY(GLOBAL PROPERTY COMPILE_FLAGS "-stdlib=libc++ -mmacosx-version-min=10.7")
 ENDIF (APPLE)
 IF(UNIX AND NOT CMAKE_BUILD_TYPE)

--- a/cmake_modules/FindOpenCL.cmake
+++ b/cmake_modules/FindOpenCL.cmake
@@ -16,6 +16,15 @@ find_path(OPENCL_INCLUDE_DIR
    PATH_SUFFIXES "include"
    NO_DEFAULT_PATH
 )
+# On Macs, look inside the platform SDK
+if(DEFINED CMAKE_OSX_SYSROOT)
+    find_path(OPENCL_INCLUDE_DIR
+        NAMES opencl.h opencl.h
+        PATHS
+            "${CMAKE_OSX_SYSROOT}/System/Library/Frameworks/OpenCL.framework/Headers"
+        NO_DEFAULT_PATH
+    )
+endif(DEFINED CMAKE_OSX_SYSROOT)
 # As a last resort, look in default system areas followed by other possible locations
 find_path(OPENCL_INCLUDE_DIR
    NAMES OpenCL/opencl.h CL/opencl.h

--- a/examples/5dfr_minimized.pdb
+++ b/examples/5dfr_minimized.pdb
--- a/examples/5dfr_solv-cube_equil.pdb
+++ b/examples/5dfr_solv-cube_equil.pdb
--- a/examples/benchmark.py
+++ b/examples/benchmark.py
+from __future__ import print_function
+import simtk.openmm.app as app
+import simtk.openmm as mm
+import simtk.unit as unit
+import sys
+from datetime import datetime
+from optparse import OptionParser
+def timeIntegration(context, steps):
+    """Integrate a Context for a specified number of steps, then return how many seconds it took."""
+    context.getIntegrator().step(5) # Make sure everything is fully initialized
+    context.getState(getEnergy=True)
+    start = datetime.now()
+    context.getIntegrator().step(steps)
+    context.getState(getEnergy=True)
+    end = datetime.now()
+    elapsed = end -start
+    return elapsed.seconds + elapsed.microseconds*1e-6
+def runOneTest(testName, options):
+    """Perform a single benchmarking simulation."""
+    explicit = (testName in ('rf', 'pme', 'amoebapme'))
+    amoeba = (testName in ('amoebagk', 'amoebapme'))
+    hydrogenMass = None
+    print()
+    if amoeba:
+        print('Test: %s (epsilon=%g)' % (testName, options.epsilon))
+    elif testName == 'pme':
+        print('Test: pme (cutoff=%g)' % options.cutoff)
+    else:
+        print('Test: %s' % testName)
+    platform = mm.Platform.getPlatformByName(options.platform)
+    # Create the System.
+    if amoeba:
+        constraints = None
+        epsilon = float(options.epsilon)
+        if epsilon == 0:
+            polarization = 'direct'
+        else:
+            polarization = 'mutual'
+        if explicit:
+            ff = app.ForceField('amoeba2009.xml')
+            pdb = app.PDBFile('5dfr_solv-cube_equil.pdb')
+            cutoff = 0.7*unit.nanometers
+            vdwCutoff = 0.9*unit.nanometers
+            system = ff.createSystem(pdb.topology, nonbondedMethod=app.PME, nonbondedCutoff=cutoff, vdwCutoff=vdwCutoff, constraints=constraints, ewaldErrorTolerance=0.00075, mutualInducedTargetEpsilon=epsilon, polarization=polarization)
+        else:
+            ff = app.ForceField('amoeba2009.xml', 'amoeba2009_gk.xml')
+            pdb = app.PDBFile('5dfr_minimized.pdb')
+            cutoff = 2.0*unit.nanometers
+            vdwCutoff = 1.2*unit.nanometers
+            system = ff.createSystem(pdb.topology, nonbondedMethod=app.NoCutoff, constraints=constraints, mutualInducedTargetEpsilon=epsilon, polarization=polarization)
+        dt = 0.001*unit.picoseconds
+    else:
+        if explicit:
+            ff = app.ForceField('amber99sb.xml', 'tip3p.xml')
+            pdb = app.PDBFile('5dfr_solv-cube_equil.pdb')
+            if testName == 'pme':
+                method = app.PME
+                cutoff = options.cutoff
+            else:
+                method = app.CutoffPeriodic
+                cutoff = 1*unit.nanometers
+        else:
+            ff = app.ForceField('amber99sb.xml', 'amber99_obc.xml')
+            pdb = app.PDBFile('5dfr_minimized.pdb')
+            method = app.CutoffNonPeriodic
+            cutoff = 2*unit.nanometers
+        if options.heavy:
+            dt = 0.005*unit.picoseconds
+            constraints = app.AllBonds
+            hydrogenMass = 4*unit.amu
+        else:
+            dt = 0.002*unit.picoseconds
+            constraints = app.HBonds
+            hydrogenMass = None
+        system = ff.createSystem(pdb.topology, nonbondedMethod=method, nonbondedCutoff=cutoff, constraints=constraints, hydrogenMass=hydrogenMass)
+    print('Step Size: %g fs' % dt.value_in_unit(unit.femtoseconds))
+    properties = {}
+    if options.device is not None:
+        if platform.getName() == 'CUDA':
+            properties['CudaDeviceIndex'] = options.device
+        elif platform.getName() == 'OpenCL':
+            properties['OpenCLDeviceIndex'] = options.device
+    if options.precision is not None:
+        if platform.getName() == 'CUDA':
+            properties['CudaPrecision'] = options.precision
+        elif platform.getName() == 'OpenCL':
+            properties['OpenCLPrecision'] = options.device
+    # Run the simulation.
+    integ = mm.LangevinIntegrator(300*unit.kelvin, 91*(1/unit.picoseconds), dt)
+    integ.setConstraintTolerance(1e-5)
+    if len(properties) > 0:
+        context = mm.Context(system, integ, platform, properties)
+    else:
+        context = mm.Context(system, integ, platform)
+    context.setPositions(pdb.positions)
+    context.setVelocitiesToTemperature(300*unit.kelvin)
+    steps = 20
+    while True:
+        time = timeIntegration(context, steps)
+        if time >= 0.5*options.seconds:
+            break
+        if time < 0.5:
+            steps = int(steps*1.0/time) # Integrate enough steps to get a reasonable estimate for how many we'll need.
+        else:
+            steps = int(steps*options.seconds/time)
+    print('Integrated %d steps in %g seconds' % (steps, time))
+    print('%g ns/day' % (dt*steps*86400/time).value_in_unit(unit.nanoseconds))
+# Parse the command line options.
+parser = OptionParser()
+platformNames = [mm.Platform.getPlatform(i).getName() for i in range(mm.Platform.getNumPlatforms())]
+parser.add_option('--platform', dest='platform', choices=platformNames, help='name of the platform to benchmark')
+parser.add_option('--test', dest='test', choices=('gbsa', 'rf', 'pme', 'amoebagk', 'amoebapme'), help='the test to perform: gbsa, rf, pme, amoebagk, or amoebapme [default: all]')
+parser.add_option('--pme-cutoff', default='0.9', dest='cutoff', type='float', help='direct space cutoff for PME in nm [default: 0.9]')
+parser.add_option('--seconds', default='60', dest='seconds', type='float', help='target simulation length in seconds [default: 60]')
+parser.add_option('--mutual-epsilon', default='1e-4', dest='epsilon', type='float', help='mutual induced epsilon for AMOEBA [default: 1e-4]')
+parser.add_option('--heavy-hydrogens', action='store_true', default=False, dest='heavy', help='repartition mass to allow a larger time step')
+parser.add_option('--device', default=None, dest='device', help='device index for CUDA or OpenCL')
+parser.add_option('--precision', default='single', dest='precision', choices=('single', 'mixed', 'double'), help='precision mode for CUDA or OpenCL: single, mixed, or double [default: single]')
+(options, args) = parser.parse_args()
+if len(args) > 0:
+    parser.error('Unknown argument: '+args[0])
+if options.platform is None:
+    parser.error('No platform specified')
+print('Platform:', options.platform)
+if options.platform in ('CUDA', 'OpenCL'):
+    print('Precision:', options.precision)
+    if options.device is not None:
+        print('Device:', options.device)
+# Run the simulations.
+if options.test is None:
+    for test in ('gbsa', 'rf', 'pme', 'amoebagk', 'amoebapme'):
+        try:
+            runOneTest(test, options)
+        except Exception as ex:
+            print('Test failed: %s' % ex.message)
+else:
+    runOneTest(options.test, options)
--- a/openmmapi/include/openmm/State.h
+++ b/openmmapi/include/openmm/State.h
@@ -34,6 +34,7 @@
 #include "Vec3.h"
 #include <map>
+#include <string>
 #include <vector>
 #include "internal/windowsExport.h"

--- a/openmmapi/include/openmm/internal/vectorize.h
+++ b/openmmapi/include/openmm/internal/vectorize.h
@@ -160,6 +160,21 @@ public:
    ivec4 operator==(ivec4 other) const {
        return _mm_cmpeq_epi32(val, other);
    }
+    ivec4 operator!=(ivec4 other) const {
+        return _mm_xor_si128(*this==other, _mm_set1_epi32(0xFFFFFFFF));
+    }
+    ivec4 operator>(ivec4 other) const {
+        return _mm_cmpgt_epi32(val, other);
+    }
+    ivec4 operator<(ivec4 other) const {
+        return _mm_cmplt_epi32(val, other);
+    }
+    ivec4 operator>=(ivec4 other) const {
+        return _mm_xor_si128(_mm_cmplt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF));
+    }
+    ivec4 operator<=(ivec4 other) const {
+        return _mm_xor_si128(_mm_cmpgt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF));
+    }
    operator fvec4() const;
 };
@@ -230,6 +245,10 @@ static inline ivec4 abs(ivec4 v) {
    return ivec4(_mm_abs_epi32(v.val));
 }
+static inline bool any(ivec4 v) {
+    return !_mm_test_all_zeros(v, _mm_set1_epi32(0xFFFFFFFF));
+}
 // Mathematical operators involving a scalar and a vector.
 static inline fvec4 operator+(float v1, fvec4 v2) {

--- a/platforms/cpu/include/CpuGBSAOBCForce.h
+++ b/platforms/cpu/include/CpuGBSAOBCForce.h
+/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+ * Contributors: Pande Group
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef OPENMM_CPU_GBSAOBC_FORCE_H__
+#define OPENMM_CPU_GBSAOBC_FORCE_H__
+#include "openmm/internal/ThreadPool.h"
+#include "openmm/internal/vectorize.h"
+#include <set>
+#include <utility>
+#include <vector>
+namespace OpenMM {
+class CpuGBSAOBCForce {
+public:
+    class ComputeTask;
+    CpuGBSAOBCForce();
+    /**
+     * Set the force to use a cutoff.
+     * 
+     * @param distance    the cutoff distance
+     */
+    void setUseCutoff(float distance);
+    /**
+     * 
+     * Set the force to use periodic boundary conditions.  This requires that a cutoff has
+     * already been set, and the smallest side of the periodic box is at least twice the cutoff
+     * distance.
+     *
+     * @param boxSize             the X, Y, and Z widths of the periodic box
+     */
+    void setPeriodic(float* periodicBoxSize);
+    /**
+     * Set the solute dielectric constant.
+     */
+    void setSoluteDielectric(float dielectric);
+    /**
+     * Set the solvent dielectric constant.
+     */
+    void setSolventDielectric(float dielectric);
+    /**
+     * Get the per-particle parameters (offset radius, scaled radius).
+     */
+    const std::vector<std::pair<float, float> >& getParticleParameters() const;
+    /**
+     * Set the per-particle parameters (offset radius, scaled radius).
+     */
+    void setParticleParameters(const std::vector<std::pair<float, float> >& params);
+    /**
+     * 
+     * Calculate LJ Coulomb pair ixn
+     *
+     * @param posq             atom coordinates and charges
+     * @param forces           force array (forces added)
+     * @param totalEnergy      total energy
+     * @param threads          the thread pool to use
+     */
+    void computeForce(const std::vector<float>& posq, std::vector<std::vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
+    /**
+     * This routine contains the code executed by each thread.
+     */
+    void threadComputeForce(ThreadPool& threads, int threadIndex);
+private:
+    bool cutoff;
+    bool periodic;
+    float periodicBoxSize[3];
+    float cutoffDistance, soluteDielectric, solventDielectric;
+    std::vector<std::pair<float, float> > particleParams;        
+    std::vector<float> bornRadii;
+    std::vector<std::vector<float> > threadBornForces;
+    std::vector<float> obcChain;
+    std::vector<double> threadEnergy;
+    std::vector<float> logTable;
+    float logDX, logDXInv;
+    // The following variables are used to make information accessible to the individual threads.
+    float const* posq;
+    std::vector<std::vector<float> >* threadForce;
+    bool includeEnergy;
+    static const int NUM_TABLE_POINTS;
+    /**
+     * Compute the displacement and squared distance between a collection of points, optionally using
+     * periodic boundary conditions.
+     */
+    void getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
+    /**
+     * Evaluate log(x) using a lookup table for speed.
+     */
+    fvec4 fastLog(fvec4 x);
+};
+} // namespace OpenMM
+// ---------------------------------------------------------------------------------------
+#endif // OPENMM_CPU_GBSAOBC_FORCE_H__
--- a/platforms/cpu/include/CpuKernels.h
+++ b/platforms/cpu/include/CpuKernels.h
@@ -32,22 +32,64 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */
-#include "CpuPlatform.h"
+#include "CpuGBSAOBCForce.h"
 #include "CpuNeighborList.h"
 #include "CpuNonbondedForce.h"
+#include "CpuPlatform.h"
 #include "openmm/kernels.h"
 #include "openmm/System.h"
-#include "openmm/internal/ThreadPool.h"
 namespace OpenMM {
+/**
+ * This kernel is invoked at the beginning and end of force and energy computations.  It gives the
+ * Platform a chance to clear buffers and do other initialization at the beginning, and to do any
+ * necessary work at the end to determine the final results.
+ */
+class CpuCalcForcesAndEnergyKernel : public CalcForcesAndEnergyKernel {
+public:
+    CpuCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data, ContextImpl& context);
+    /**
+     * Initialize the kernel.
+     * 
+     * @param system     the System this kernel will be applied to
+     */
+    void initialize(const System& system);
+    /**
+     * This is called at the beginning of each force/energy computation, before calcForcesAndEnergy() has been called on
+     * any ForceImpl.
+     *
+     * @param context       the context in which to execute this kernel
+     * @param includeForce  true if forces should be computed
+     * @param includeEnergy true if potential energy should be computed
+     * @param groups        a set of bit flags for which force groups to include
+     */
+    void beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
+    /**
+     * This is called at the end of each force/energy computation, after calcForcesAndEnergy() has been called on
+     * every ForceImpl.
+     *
+     * @param context       the context in which to execute this kernel
+     * @param includeForce  true if forces should be computed
+     * @param includeEnergy true if potential energy should be computed
+     * @param groups        a set of bit flags for which force groups to include
+     * @return the potential energy of the system.  This value is added to all values returned by ForceImpls'
+     * calcForcesAndEnergy() methods.  That is, each force kernel may <i>either</i> return its contribution to the
+     * energy directly, <i>or</i> add it to an internal buffer so that it will be included here.
+     */
+    double finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
+private:
+    CpuPlatform::PlatformData& data;
+    Kernel referenceKernel;
+};
 /**
 * This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
 */
 class CpuCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
-    CpuCalcNonbondedForceKernel(std::string name, const Platform& platform) : CalcNonbondedForceKernel(name, platform),
+    CpuCalcNonbondedForceKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data) : CalcNonbondedForceKernel(name, platform),
-            bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false) {
+            data(data), bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false) {
    }
    ~CpuCalcNonbondedForceKernel();
    /**
@@ -77,6 +119,7 @@ public:
    void copyParametersToContext(ContextImpl& context, const NonbondedForce& force);
 private:
    class PmeIO;
+    CpuPlatform::PlatformData& data;
    int numParticles, num14;
    int **bonded14IndexArray;
    double **bonded14ParamArray;
@@ -85,16 +128,51 @@ private:
    bool useSwitchingFunction, useOptimizedPme, hasInitializedPme;
    std::vector<std::set<int> > exclusions;
    std::vector<std::pair<float, float> > particleParams;
-    std::vector<float> posq;
-    std::vector<float> forces;
    std::vector<RealVec> lastPositions;
    NonbondedMethod nonbondedMethod;
    CpuNeighborList neighborList;
    CpuNonbondedForce nonbonded;
-    ThreadPool threads;
    Kernel optimizedPme;
 };
+/**
+ * This kernel is invoked by GBSAOBCForce to calculate the forces acting on the system.
+ */
+class CpuCalcGBSAOBCForceKernel : public CalcGBSAOBCForceKernel {
+public:
+    CpuCalcGBSAOBCForceKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data) : CalcGBSAOBCForceKernel(name, platform),
+            data(data) {
+    }
+    ~CpuCalcGBSAOBCForceKernel();
+    /**
+     * Initialize the kernel.
+     * 
+     * @param system     the System this kernel will be applied to
+     * @param force      the GBSAOBCForce this kernel will be used for
+     */
+    void initialize(const System& system, const GBSAOBCForce& force);
+    /**
+     * Execute the kernel to calculate the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Copy changed parameters over to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the GBSAOBCForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force);
+private:
+    CpuPlatform::PlatformData& data;
+    std::vector<std::pair<float, float> > particleParams;
+    CpuGBSAOBCForce obc;
+};
 } // namespace OpenMM
 #endif /*OPENMM_CPUKERNELS_H_*/

--- a/platforms/cpu/include/CpuNonbondedForce.h
+++ b/platforms/cpu/include/CpuNonbondedForce.h
@@ -124,7 +124,7 @@ class CpuNonbondedForce {
      void calculateReciprocalIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates,
                            const std::vector<std::pair<float, float> >& atomParameters, const std::vector<std::set<int> >& exclusions,
-                            std::vector<RealVec>& forces, float* totalEnergy) const;
+                            std::vector<RealVec>& forces, double* totalEnergy) const;
      /**---------------------------------------------------------------------------------------
@@ -143,7 +143,7 @@ class CpuNonbondedForce {
         --------------------------------------------------------------------------------------- */
      void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates, const std::vector<std::pair<float, float> >& atomParameters,
-            const std::vector<std::set<int> >& exclusions, float* forces, float* totalEnergy, ThreadPool& threads);
+            const std::vector<std::set<int> >& exclusions, std::vector<std::vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
    /**
     * This routine contains the code executed by each thread.
@@ -165,7 +165,6 @@ private:
        int meshDim[3];
        std::vector<float> ewaldScaleTable;
        float ewaldDX, ewaldDXInv;
-        std::vector<std::vector<float> > threadForce;
        std::vector<double> threadEnergy;
        // The following variables are used to make information accessible to the individual threads.
        int numberOfAtoms;
@@ -173,6 +172,7 @@ private:
        RealVec const* atomCoordinates;
        std::pair<float, float> const* atomParameters;        
        std::set<int> const* exclusions;
+        std::vector<std::vector<float> >* threadForce;
        bool includeEnergy;
        static const float TWO_OVER_SQRT_PI;

--- a/platforms/cpu/include/CpuPlatform.h
+++ b/platforms/cpu/include/CpuPlatform.h
@@ -33,7 +33,10 @@
 * -------------------------------------------------------------------------- */
 #include "ReferencePlatform.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/internal/ThreadPool.h"
 #include "windowsExportCpu.h"
+#include <map>
 namespace OpenMM {
@@ -43,6 +46,7 @@ namespace OpenMM {
 class OPENMM_EXPORT_CPU CpuPlatform : public ReferencePlatform {
 public:
+    class PlatformData;
    CpuPlatform();
    const std::string& getName() const {
        static const std::string name = "CPU";
@@ -51,6 +55,24 @@ public:
    double getSpeed() const;
    bool supportsDoublePrecision() const;
    static bool isProcessorSupported();
+    void contextCreated(ContextImpl& context, const std::map<std::string, std::string>& properties) const;
+    void contextDestroyed(ContextImpl& context) const;
+    /**
+     * We cannot use the standard mechanism for platform data, because that is already used by the superclass.
+     * Instead, we maintain a table of ContextImpls to PlatformDatas.
+     */
+    static PlatformData& getPlatformData(ContextImpl& context);
+private:
+    static std::map<ContextImpl*, PlatformData*> contextData;
+};
+class CpuPlatform::PlatformData {
+public:
+    PlatformData(int numParticles);
+    std::vector<float> posq;
+    std::vector<std::vector<float> > threadForce;
+    ThreadPool threads;
+    bool isPeriodic;
 };
 } // namespace OpenMM

--- a/platforms/cpu/sharedTarget/CMakeLists.txt
+++ b/platforms/cpu/sharedTarget/CMakeLists.txt
-SET_SOURCE_FILES_PROPERTIES(${SOURCE_FILES} PROPERTIES COMPILE_FLAGS "-msse4.1")
+GET_PROPERTY(COMPILE_FLAGS GLOBAL PROPERTY COMPILE_FLAGS)
+SET_SOURCE_FILES_PROPERTIES(${SOURCE_FILES} PROPERTIES COMPILE_FLAGS "${COMPILE_FLAGS} -msse4.1")
 ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
 IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
@@ -7,6 +8,6 @@ ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
 ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
 TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${PTHREADS_LIB})
-SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_CPU_BUILDING_SHARED_LIBRARY")
+SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_CPU_BUILDING_SHARED_LIBRARY" LINK_FLAGS "${COMPILE_FLAGS}")
 INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
--- a/platforms/cpu/src/CpuGBSAOBCForce.cpp
+++ b/platforms/cpu/src/CpuGBSAOBCForce.cpp
+/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+ * Contributors: Pande Group
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "CpuGBSAOBCForce.h"
+#include "SimTKOpenMMRealType.h"
+#include "openmm/internal/SplineFitter.h"
+#include "openmm/internal/vectorize.h"
+#include <cmath>
+using namespace std;
+using namespace OpenMM;
+const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 1025;
+class CpuGBSAOBCForce::ComputeTask : public ThreadPool::Task {
+public:
+    ComputeTask(CpuGBSAOBCForce& owner) : owner(owner) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        owner.threadComputeForce(threads, threadIndex);
+    }
+    CpuGBSAOBCForce& owner;
+};
+CpuGBSAOBCForce::CpuGBSAOBCForce() : cutoff(false), periodic(false) {
+    logDX = 0.5/NUM_TABLE_POINTS;
+    logDXInv = 1.0f/logDX;
+    vector<double> x(NUM_TABLE_POINTS+1);
+    vector<double> y(NUM_TABLE_POINTS+1);
+    vector<double> deriv;
+    for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
+        x[i] = 0.5+i*0.5/NUM_TABLE_POINTS;
+        y[i] = log(x[i]);
+    }
+    SplineFitter::createNaturalSpline(x, y, deriv);
+    logTable.resize(4*NUM_TABLE_POINTS);
+    for (int i = 0; i < NUM_TABLE_POINTS; i++) {
+        logTable[4*i] = (float) y[i];
+        logTable[4*i+1] = (float) y[i+1];
+        logTable[4*i+2] = (float) (deriv[i]*logDX*logDX/6);
+        logTable[4*i+3] = (float) (deriv[i+1]*logDX*logDX/6);
+    }
+}
+void CpuGBSAOBCForce::setUseCutoff(float distance) {
+    cutoff = true;
+    cutoffDistance = distance;
+}
+void CpuGBSAOBCForce::setPeriodic(float* periodicBoxSize) {
+    periodic = true;
+    this->periodicBoxSize[0] = periodicBoxSize[0];
+    this->periodicBoxSize[1] = periodicBoxSize[1];
+    this->periodicBoxSize[2] = periodicBoxSize[2];
+}
+void CpuGBSAOBCForce::setSoluteDielectric(float dielectric) {
+    soluteDielectric = dielectric;
+}
+void CpuGBSAOBCForce::setSolventDielectric(float dielectric) {
+    solventDielectric = dielectric;
+}
+const std::vector<std::pair<float, float> >& CpuGBSAOBCForce::getParticleParameters() const {
+    return particleParams;
+}
+void CpuGBSAOBCForce::setParticleParameters(const std::vector<std::pair<float, float> >& params) {
+    particleParams = params;
+    bornRadii.resize(params.size()+3);
+    obcChain.resize(params.size()+3);
+}
+void CpuGBSAOBCForce::computeForce(const std::vector<float>& posq, vector<vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
+    // Record the parameters for the threads.
+    this->posq = &posq[0];
+    this->threadForce = &threadForce;
+    includeEnergy = (totalEnergy != NULL);
+    int numThreads = threads.getNumThreads();
+    threadEnergy.resize(numThreads);
+    threadBornForces.resize(numThreads);
+    for (int i = 0; i < numThreads; i++)
+        threadBornForces[i].resize(particleParams.size()+3);
+    // Signal the threads to start running and wait for them to finish.
+    ComputeTask task(*this);
+    threads.execute(task);
+    threads.waitForThreads(); // Compute Born radii
+    threads.resumeThreads();
+    threads.waitForThreads(); // Compute surface area term
+    threads.resumeThreads();
+    threads.waitForThreads(); // First loop
+    threads.resumeThreads();
+    threads.waitForThreads(); // Second loop
+    // Combine the energies from all the threads.
+    if (totalEnergy != NULL) {
+        double energy = 0;
+        for (int i = 0; i < numThreads; i++)
+            energy += threadEnergy[i];
+        *totalEnergy += (float) energy;
+    }
+}
+void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
+    int numParticles = particleParams.size();
+    int numThreads = threads.getNumThreads();
+    const float dielectricOffset = 0.009;
+    const float alphaObc = 1.0f;
+    const float betaObc = 0.8f;
+    const float gammaObc = 4.85f;
+    fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
+    fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
+    int start = (threadIndex*numParticles)/numThreads;
+    int end = ((threadIndex+1)*numParticles)/numThreads;
+    // Calculate Born radii
+    for (int blockStart = start; blockStart < end; blockStart += 4) {
+        int numInBlock = min(4, end-blockStart);
+        ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
+        float atomRadius[4], atomx[4], atomy[4], atomz[4];
+        int blockMask[4] = {0, 0, 0, 0};
+        for (int i = 0; i < numInBlock; i++) {
+            int atomIndex = blockStart+i;
+            atomRadius[i] = particleParams[atomIndex].first;
+            atomx[i] = posq[4*atomIndex];
+            atomy[i] = posq[4*atomIndex+1];
+            atomz[i] = posq[4*atomIndex+2];
+            blockMask[i] = 0xFFFFFFFF;
+        }
+        fvec4 offsetRadiusI(atomRadius);
+        fvec4 radiusIInverse = 1.0f/offsetRadiusI;
+        fvec4 x(atomx);
+        fvec4 y(atomy);
+        fvec4 z(atomz);
+        ivec4 mask(blockMask);
+        float sum[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+        for (int atomJ = 0; atomJ < numParticles; atomJ++) {
+            fvec4 posJ(posq+4*atomJ);
+            fvec4 dx, dy, dz, r2;
+            getDeltaR(posJ, x, y, z, dx, dy, dz, r2, periodic, boxSize, invBoxSize);
+            ivec4 include = mask & (blockAtomIndex != ivec4(atomJ));
+            if (cutoff)
+                include = include & (r2 < cutoffDistance*cutoffDistance);
+            if (!any(include))
+                continue;
+            fvec4 r = sqrt(r2);
+            float scaledRadiusJ = particleParams[atomJ].second;
+            float scaledRadiusJ2 = scaledRadiusJ*scaledRadiusJ;
+            fvec4 rScaledRadiusJ = r + scaledRadiusJ;
+            include = include & (offsetRadiusI < rScaledRadiusJ);
+            fvec4 l_ij = 1.0f/max(offsetRadiusI, abs(r-scaledRadiusJ));
+            fvec4 u_ij = 1.0f/rScaledRadiusJ;
+            fvec4 l_ij2 = l_ij*l_ij;
+            fvec4 u_ij2 = u_ij*u_ij;
+            fvec4 rInverse = 1.0f/r;
+            fvec4 r2Inverse = rInverse*rInverse;
+            fvec4 logRatio = fastLog(u_ij/l_ij);
+            fvec4 term = l_ij - u_ij + 0.25f*r*(u_ij2 - l_ij2) + (0.5f*rInverse*logRatio) + (0.25f*scaledRadiusJ*scaledRadiusJ*rInverse)*(l_ij2 - u_ij2);
+            for (int j = 0; j < 4; j++) {
+                if (include[j]) {
+                    sum[j] += term[j];
+                    if (offsetRadiusI[j] < scaledRadiusJ-r[j])
+                        sum[j] += 2.0f*(radiusIInverse[j]-l_ij[j]);
+                }
+            }
+        }
+        for (int i = 0; i < numInBlock; i++) {
+            int atomIndex = blockStart+i;
+            sum[i] *= 0.5f*atomRadius[i];
+            float sum2 = sum[i]*sum[i];
+            float sum3 = sum[i]*sum2;
+            float tanhSum = tanh(alphaObc*sum[i] - betaObc*sum2 + gammaObc*sum3);
+            float radiusI = atomRadius[i] + dielectricOffset;
+            bornRadii[atomIndex] = 1.0f/(1.0f/atomRadius[i] - tanhSum/radiusI);
+            obcChain[atomIndex] = atomRadius[i]*(alphaObc - 2.0f*betaObc*sum[i] + 3.0f*gammaObc*sum2);
+            obcChain[atomIndex] = (1.0f - tanhSum*tanhSum)*obcChain[atomIndex]/radiusI;
+        }
+    }
+    threads.syncThreads();
+    // Calculate ACE surface area term.
+    const float probeRadius = 0.14f;
+    const float surfaceAreaFactor = 28.3919551;
+    double energy = 0.0;
+    vector<float>& bornForces = threadBornForces[threadIndex];
+    for (int i = 0; i < numParticles; i++)
+        bornForces[i] = 0.0f;
+    for (int atomI = start; atomI < end; atomI++) {
+        if (bornRadii[atomI] > 0) {
+            float radiusI = particleParams[atomI].first + dielectricOffset;
+            float r = radiusI + probeRadius;
+            float ratio6 = powf(radiusI/bornRadii[atomI], 6.0f);
+            float saTerm = surfaceAreaFactor*r*r*ratio6;
+            energy += saTerm;
+            bornForces[atomI] = -6.0f*saTerm/bornRadii[atomI]; 
+        }
+        else
+            bornForces[atomI] = 0.0f;
+    }
+    threads.syncThreads();
+    // First loop of Born energy computation.
+    float* forces = &(*threadForce)[threadIndex][0];
+    float preFactor;
+    if (soluteDielectric != 0.0f && solventDielectric != 0.0f)
+        preFactor = ONE_4PI_EPS0*((1.0f/solventDielectric) - (1.0f/soluteDielectric));
+    else
+        preFactor = 0.0f;
+    for (int blockStart = start; blockStart < end; blockStart += 4) {
+        int numInBlock = min(4, end-blockStart);
+        ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
+        float atomCharge[4], atomx[4], atomy[4], atomz[4];
+        int blockMask[4] = {0, 0, 0, 0};
+        fvec4 blockAtomForce[4];
+        for (int i = 0; i < numInBlock; i++) {
+            int atomIndex = blockStart+i;
+            atomx[i] = posq[4*atomIndex];
+            atomy[i] = posq[4*atomIndex+1];
+            atomz[i] = posq[4*atomIndex+2];
+            atomCharge[i] = preFactor*posq[4*atomIndex+3];
+            blockMask[i] = 0xFFFFFFFF;
+            blockAtomForce[i] = 0.0f;
+        }
+        fvec4 radii(&bornRadii[blockStart]);
+        fvec4 x(atomx);
+        fvec4 y(atomy);
+        fvec4 z(atomz);
+        fvec4 partialChargeI(atomCharge);
+        ivec4 mask(blockMask);
+        for (int atomJ = blockStart; atomJ < numParticles; atomJ++) {
+            fvec4 posJ(posq+4*atomJ);
+            fvec4 dx, dy, dz, r2;
+            getDeltaR(posJ, x, y, z, dx, dy, dz, r2, periodic, boxSize, invBoxSize);
+            ivec4 include = mask & (blockAtomIndex <= ivec4(atomJ));
+            if (cutoff)
+                include = include & (r2 < cutoffDistance*cutoffDistance);
+            if (!any(include))
+                continue;
+            fvec4 r = sqrt(r2);
+            fvec4 alpha2_ij = radii*bornRadii[atomJ];
+            fvec4 D_ij = r2/(4.0f*alpha2_ij);
+            fvec4 expTerm(exp(-D_ij[0]), exp(-D_ij[1]), exp(-D_ij[2]), exp(-D_ij[3]));
+            fvec4 denominator2 = r2 + alpha2_ij*expTerm; 
+            fvec4 denominator = sqrt(denominator2);
+            fvec4 Gpol = (partialChargeI*posJ[3])/denominator; 
+            fvec4 dGpol_dr = -Gpol*(1.0f - 0.25f*expTerm)/denominator2;  
+            fvec4 dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f + D_ij)/denominator2;
+            fvec4 result[4] = {dx*dGpol_dr, dy*dGpol_dr, dz*dGpol_dr, 0.0f};
+            transpose(result[0], result[1], result[2], result[3]);
+            fvec4 atomForce(forces+4*atomJ);
+            for (int j = 0; j < 4; j++) {
+                if (include[j]) {
+                    float termEnergy = Gpol[j];
+                    if (blockStart+j != atomJ) {
+                        if (cutoff)
+                            termEnergy -= partialChargeI[j]*posJ[3]/cutoffDistance;
+                        bornForces[atomJ] += dGpol_dalpha2_ij[j]*radii[j];
+                        blockAtomForce[j] -= result[j];
+                        (fvec4(forces+4*atomJ)+result[j]).store(forces+4*atomJ);
+                    }
+                    else
+                        termEnergy *= 0.5f;
+                    energy += termEnergy;
+                    bornForces[blockStart+j] += dGpol_dalpha2_ij[j]*bornRadii[atomJ];
+                }
+            }
+        }
+        for (int i = 0; i < numInBlock; i++) {
+            int atomIndex = blockStart+i;
+            (fvec4(forces+4*atomIndex)+blockAtomForce[i]).store(forces+4*atomIndex);
+        }
+    }
+    threads.syncThreads();
+    // Second loop of Born energy computation.
+    for (int blockStart = start; blockStart < end; blockStart += 4) {
+        fvec4 bornForce(0.0f);
+        for (int i = 0; i < numThreads; i++)
+            bornForce += fvec4(&threadBornForces[i][blockStart]);
+        fvec4 radii(&bornRadii[blockStart]);
+        bornForce *= radii*radii*fvec4(&obcChain[blockStart]);
+        int numInBlock = min(4, end-blockStart);
+        ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
+        float atomRadius[4], atomx[4], atomy[4], atomz[4];
+        int blockMask[4] = {0, 0, 0, 0};
+        fvec4 blockAtomForce[4];
+        for (int i = 0; i < numInBlock; i++) {
+            int atomIndex = blockStart+i;
+            atomRadius[i] = particleParams[atomIndex].first;
+            atomx[i] = posq[4*atomIndex];
+            atomy[i] = posq[4*atomIndex+1];
+            atomz[i] = posq[4*atomIndex+2];
+            blockMask[i] = 0xFFFFFFFF;
+            blockAtomForce[i] = 0.0f;
+        }
+        fvec4 offsetRadiusI(atomRadius);
+        fvec4 x(atomx);
+        fvec4 y(atomy);
+        fvec4 z(atomz);
+        ivec4 mask(blockMask);
+        for (int atomJ = 0; atomJ < numParticles; atomJ++) {
+            fvec4 posJ(posq+4*atomJ);
+            fvec4 dx, dy, dz, r2;
+            getDeltaR(posJ, x, y, z, dx, dy, dz, r2, periodic, boxSize, invBoxSize);
+            ivec4 include = mask & (blockAtomIndex != ivec4(atomJ));
+            if (cutoff)
+                include = include & (r2 < cutoffDistance*cutoffDistance);
+            if (!any(include))
+                continue;
+            fvec4 r = sqrt(r2);
+            float scaledRadiusJ = particleParams[atomJ].second;
+            float scaledRadiusJ2 = scaledRadiusJ*scaledRadiusJ;
+            fvec4 rScaledRadiusJ = r + scaledRadiusJ;
+            include = include & (offsetRadiusI < rScaledRadiusJ);
+            fvec4 l_ij = 1.0f/max(offsetRadiusI, abs(r-scaledRadiusJ));
+            fvec4 u_ij = 1.0f/rScaledRadiusJ;
+            fvec4 l_ij2 = l_ij*l_ij;
+            fvec4 u_ij2 = u_ij*u_ij;
+            fvec4 rInverse = 1.0f/r;
+            fvec4 r2Inverse = rInverse*rInverse;
+            fvec4 logRatio = fastLog(u_ij/l_ij);
+            fvec4 t3 = 0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*(l_ij2 - u_ij2) + 0.25f*logRatio*r2Inverse;
+            fvec4 de = bornForce*t3*rInverse;
+            fvec4 result[4] = {dx*de, dy*de, dz*de, 0.0f};
+            transpose(result[0], result[1], result[2], result[3]);
+            fvec4 atomForce(forces+4*atomJ);
+            for (int j = 0; j < 4; j++) {
+                if (include[j]) {
+                    blockAtomForce[j] += result[j];
+                    atomForce -= result[j];
+                }
+            }
+            atomForce.store(forces+4*atomJ);
+        }
+        for (int i = 0; i < numInBlock; i++) {
+            int atomIndex = blockStart+i;
+            (fvec4(forces+4*atomIndex)+blockAtomForce[i]).store(forces+4*atomIndex);
+        }
+    }
+    threadEnergy[threadIndex] = energy;
+}
+void CpuGBSAOBCForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
+    dx = x-posI[0];
+    dy = y-posI[1];
+    dz = z-posI[2];
+    if (periodic) {
+        dx -= round(dx*invBoxSize[0])*boxSize[0];
+        dy -= round(dy*invBoxSize[1])*boxSize[1];
+        dz -= round(dz*invBoxSize[2])*boxSize[2];
+    }
+    r2 = dx*dx + dy*dy + dz*dz;
+}
+fvec4 CpuGBSAOBCForce::fastLog(fvec4 x) {
+    // Evaluate log(x) using a lookup table for speed.
+    float y[4];
+    fvec4 x1 = (x-0.5f)*logDXInv;
+    ivec4 index = floor(x1);
+    fvec4 coeff[4];
+    coeff[1] = x1-index;
+    coeff[0] = 1.0f-coeff[1];
+    coeff[2] = coeff[0]*coeff[0]*coeff[0]-coeff[0];
+    coeff[3] = coeff[1]*coeff[1]*coeff[1]-coeff[1];
+    transpose(coeff[0], coeff[1], coeff[2], coeff[3]);
+    static float maxdiff = 0.0f;
+    for (int i = 0; i < 4; i++) {
+        if (index[i] >= 0 && index[i] < NUM_TABLE_POINTS)
+            y[i] = dot4(coeff[i], fvec4(&logTable[4*index[i]]));
+        else
+            y[i] = logf(x[i]);
+    }
+    return fvec4(y);
+}
--- a/platforms/cpu/src/CpuKernelFactory.cpp
+++ b/platforms/cpu/src/CpuKernelFactory.cpp
@@ -38,8 +38,12 @@
 using namespace OpenMM;
 KernelImpl* CpuKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
-    ReferencePlatform::PlatformData& data = *static_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
+    CpuPlatform::PlatformData& data = CpuPlatform::getPlatformData(context);
+    if (name == CalcForcesAndEnergyKernel::Name())
+        return new CpuCalcForcesAndEnergyKernel(name, platform, data, context);
    if (name == CalcNonbondedForceKernel::Name())
-        return new CpuCalcNonbondedForceKernel(name, platform);
+        return new CpuCalcNonbondedForceKernel(name, platform, data);
+    if (name == CalcGBSAOBCForceKernel::Name())
+        return new CpuCalcGBSAOBCForceKernel(name, platform, data);
    throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '") + name + "'").c_str());
 }
--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -31,11 +31,14 @@
 #include "CpuKernels.h"
 #include "ReferenceBondForce.h"
+#include "ReferenceKernelFactory.h"
+#include "ReferenceKernels.h"
 #include "ReferenceLJCoulomb14.h"
 #include "openmm/Context.h"
 #include "openmm/OpenMMException.h"
 #include "openmm/internal/ContextImpl.h"
 #include "openmm/internal/NonbondedForceImpl.h"
+#include "openmm/internal/vectorize.h"
 #include "RealVec.h"
 using namespace OpenMM;
@@ -61,6 +64,67 @@ static RealVec& extractBoxSize(ContextImpl& context) {
    return *(RealVec*) data->periodicBoxSize;
 }
+CpuCalcForcesAndEnergyKernel::CpuCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data, ContextImpl& context) :
+        CalcForcesAndEnergyKernel(name, platform), data(data) {
+    // Create a Reference platform version of this kernel.
+    ReferenceKernelFactory referenceFactory;
+    referenceKernel = Kernel(referenceFactory.createKernelImpl(name, platform, context));
+}
+void CpuCalcForcesAndEnergyKernel::initialize(const System& system) {
+    referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().initialize(system);
+}
+void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
+    referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().beginComputation(context, includeForce, includeEnergy, groups);
+    // Convert the positions to single precision and apply periodic boundary conditions
+    vector<float>& posq = data.posq;
+    vector<RealVec>& posData = extractPositions(context);
+    RealVec boxSize = extractBoxSize(context);
+    float floatBoxSize[3] = {(float) boxSize[0], (float) boxSize[1], (float) boxSize[2]};
+    int numParticles = context.getSystem().getNumParticles();
+    if (data.isPeriodic)
+        for (int i = 0; i < numParticles; i++)
+            for (int j = 0; j < 3; j++) {
+                RealOpenMM x = posData[i][j];
+                double base = floor(x/boxSize[j])*boxSize[j];
+                posq[4*i+j] = (float) (x-base);
+            }
+    else
+        for (int i = 0; i < numParticles; i++) {
+            posq[4*i] = (float) posData[i][0];
+            posq[4*i+1] = (float) posData[i][1];
+            posq[4*i+2] = (float) posData[i][2];
+        }
+    // Clear the forces.
+    fvec4 zero(0.0f);
+    for (int i = 0; i < (int) data.threadForce.size(); i++)
+        for (int j = 0; j < numParticles; j++)
+            zero.store(&data.threadForce[i][j*4]);
+}
+double CpuCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
+    // Sum the forces from all the threads.
+    int numParticles = context.getSystem().getNumParticles();
+    int numThreads = data.threads.getNumThreads();
+    vector<RealVec>& forceData = extractForces(context);
+    for (int i = 0; i < numParticles; i++) {
+        fvec4 f(0.0f);
+        for (int j = 0; j < numThreads; j++)
+            f += fvec4(&data.threadForce[j][4*i]);
+        forceData[i][0] += f[0];
+        forceData[i][1] += f[1];
+        forceData[i][2] += f[2];
+    }
+    return referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().finishComputation(context, includeForce, includeEnergy, groups);
+}
 class CpuCalcNonbondedForceKernel::PmeIO : public CalcPmeReciprocalForceKernel::IO {
 public:
    PmeIO(float* posq, float* force, int numParticles) : posq(posq), force(force), numParticles(numParticles) {
@@ -97,8 +161,6 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
    // Identify which exceptions are 1-4 interactions.
    numParticles = force.getNumParticles();
-    posq.resize(4*numParticles, 0);
-    forces.resize(4*numParticles, 0);
    exclusions.resize(numParticles);
    vector<int> nb14s;
    for (int i = 0; i < force.getNumExceptions(); i++) {
@@ -125,7 +187,7 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
    for (int i = 0; i < numParticles; ++i) {
        double charge, radius, depth;
        force.getParticleParameters(i, charge, radius, depth);
-        posq[4*i+3] = (float) charge;
+        data.posq[4*i+3] = (float) charge;
        particleParams[i] = make_pair((float) (0.5*radius), (float) (2.0*sqrt(depth)));
        sumSquaredCharges += charge*charge;
    }
@@ -173,6 +235,7 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
    else
        dispersionCoefficient = 0.0;
    lastPositions.resize(numParticles, Vec3(1e10, 1e10, 1e10));
+    data.isPeriodic = (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME);
 }
 double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
@@ -192,32 +255,14 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
            }
        }
    }
+    vector<float>& posq = data.posq;
    vector<RealVec>& posData = extractPositions(context);
    vector<RealVec>& forceData = extractForces(context);
    RealVec boxSize = extractBoxSize(context);
    float floatBoxSize[3] = {(float) boxSize[0], (float) boxSize[1], (float) boxSize[2]};
    double energy = ewaldSelfEnergy;
-    bool periodic = (nonbondedMethod == CutoffPeriodic);
    bool ewald  = (nonbondedMethod == Ewald);
    bool pme  = (nonbondedMethod == PME);
-    // Convert the positions to single precision.
-    if (periodic || ewald || pme)
-        for (int i = 0; i < numParticles; i++)
-            for (int j = 0; j < 3; j++) {
-                RealOpenMM x = posData[i][j];
-                double base = floor(x/boxSize[j])*boxSize[j];
-                posq[4*i+j] = (float) (x-base);
-            }
-    else
-        for (int i = 0; i < numParticles; i++) {
-            posq[4*i] = (float) posData[i][0];
-            posq[4*i+1] = (float) posData[i][1];
-            posq[4*i+2] = (float) posData[i][2];
-        }
-    for (int i = 0; i < 4*numParticles; i++)
-        forces[i] = 0.0f;
    if (nonbondedMethod != NoCutoff) {
        // Determine whether we need to recompute the neighbor list.
@@ -260,12 +305,12 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
                }
        }
        if (needRecompute) {
-            neighborList.computeNeighborList(numParticles, posq, exclusions, floatBoxSize, periodic || ewald || pme, nonbondedCutoff+padding, threads);
+            neighborList.computeNeighborList(numParticles, posq, exclusions, floatBoxSize, data.isPeriodic, nonbondedCutoff+padding, data.threads);
            lastPositions = posData;
        }
        nonbonded.setUseCutoff(nonbondedCutoff, neighborList, rfDielectric);
    }
-    if (periodic || ewald || pme) {
+    if (data.isPeriodic) {
        double minAllowedSize = 1.999999*nonbondedCutoff;
        if (boxSize[0] < minAllowedSize || boxSize[1] < minAllowedSize || boxSize[2] < minAllowedSize)
            throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
@@ -277,12 +322,12 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
        nonbonded.setUsePME(ewaldAlpha, gridSize);
    if (useSwitchingFunction)
        nonbonded.setUseSwitchingFunction(switchingDistance);
-    float nonbondedEnergy = 0;
+    double nonbondedEnergy = 0;
    if (includeDirect)
-        nonbonded.calculateDirectIxn(numParticles, &posq[0], posData, particleParams, exclusions, &forces[0], includeEnergy ? &nonbondedEnergy : NULL, threads);
+        nonbonded.calculateDirectIxn(numParticles, &posq[0], posData, particleParams, exclusions, data.threadForce, includeEnergy ? &nonbondedEnergy : NULL, data.threads);
    if (includeReciprocal) {
        if (useOptimizedPme) {
-            PmeIO io(&posq[0], &forces[0], numParticles);
+            PmeIO io(&posq[0], &data.threadForce[0][0], numParticles);
            Vec3 periodicBoxSize(boxSize[0], boxSize[1], boxSize[2]);
            optimizedPme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, periodicBoxSize, includeEnergy);
            optimizedPme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
@@ -291,16 +336,11 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
            nonbonded.calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);
    }
    energy += nonbondedEnergy;
-    for (int i = 0; i < numParticles; i++) {
-        forceData[i][0] += forces[4*i];
-        forceData[i][1] += forces[4*i+1];
-        forceData[i][2] += forces[4*i+2];
-    }
    if (includeDirect) {
        ReferenceBondForce refBondForce;
        ReferenceLJCoulomb14 nonbonded14;
        refBondForce.calculateForce(num14, bonded14IndexArray, posData, bonded14ParamArray, forceData, includeEnergy ? &energy : NULL, nonbonded14);
-        if (periodic || ewald || pme)
+        if (data.isPeriodic)
            energy += dispersionCoefficient/(boxSize[0]*boxSize[1]*boxSize[2]);
    }
    return energy;
@@ -326,7 +366,7 @@ void CpuCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
    for (int i = 0; i < numParticles; ++i) {
        double charge, radius, depth;
        force.getParticleParameters(i, charge, radius, depth);
-        posq[4*i+3] = (float) charge;
+        data.posq[4*i+3] = (float) charge;
        particleParams[i] = make_pair((float) (0.5*radius), (float) (2.0*sqrt(depth)));
        sumSquaredCharges += charge*charge;
    }
@@ -351,3 +391,52 @@ void CpuCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
    if (force.getUseDispersionCorrection() && (method == NonbondedForce::CutoffPeriodic || method == NonbondedForce::Ewald || method == NonbondedForce::PME))
        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(context.getSystem(), force);
 }
+CpuCalcGBSAOBCForceKernel::~CpuCalcGBSAOBCForceKernel() {
+}
+void CpuCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCForce& force) {
+    int numParticles = system.getNumParticles();
+    particleParams.resize(numParticles);
+    for (int i = 0; i < numParticles; ++i) {
+        double charge, radius, scalingFactor;
+        force.getParticleParameters(i, charge, radius, scalingFactor);
+        data.posq[4*i+3] = (float) charge;
+        radius -= 0.009;
+        particleParams[i] = make_pair((float) radius, (float) (scalingFactor*radius));
+    }
+    obc.setParticleParameters(particleParams);
+    obc.setSolventDielectric((float) force.getSolventDielectric());
+    obc.setSoluteDielectric((float) force.getSoluteDielectric());
+    if (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff)
+        obc.setUseCutoff((float) force.getCutoffDistance());
+    data.isPeriodic = (force.getNonbondedMethod() == GBSAOBCForce::CutoffPeriodic);
+}
+double CpuCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    if (data.isPeriodic) {
+        RealVec& boxSize = extractBoxSize(context);
+        float floatBoxSize[3] = {(float) boxSize[0], (float) boxSize[1], (float) boxSize[2]};
+        obc.setPeriodic(floatBoxSize);
+    }
+    double energy = 0.0;
+    obc.computeForce(data.posq, data.threadForce, includeEnergy ? &energy : NULL, data.threads);
+    return energy;
+}
+void CpuCalcGBSAOBCForceKernel::copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force) {
+    int numParticles = force.getNumParticles();
+    if (numParticles != obc.getParticleParameters().size())
+        throw OpenMMException("updateParametersInContext: The number of particles has changed");
+    // Record the values.
+    for (int i = 0; i < numParticles; ++i) {
+        double charge, radius, scalingFactor;
+        force.getParticleParameters(i, charge, radius, scalingFactor);
+        data.posq[4*i+3] = (float) charge;
+        radius -= 0.009;
+        particleParams[i] = make_pair((float) radius, (float) (scalingFactor*radius));
+    }
+    obc.setParticleParameters(particleParams);
+}
--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
@@ -164,8 +164,12 @@ public:
        float refineCutoff = maxDistance-max(max(blockWidth[0], blockWidth[1]), blockWidth[2]);
        float refineCutoffSquared = refineCutoff*refineCutoff;
-        int dIndexX = min(nx/2, int((maxDistance+blockWidth[0])/voxelSizeX)+1); // How may voxels away do we have to look?
+        int dIndexX = int((maxDistance+blockWidth[0])/voxelSizeX)+1; // How may voxels away do we have to look?
-        int dIndexY = min(ny/2, int((maxDistance+blockWidth[1])/voxelSizeY)+1);
+        int dIndexY = int((maxDistance+blockWidth[1])/voxelSizeY)+1;
+        if (usePeriodic) {
+            dIndexX = min(nx/2, dIndexX);
+            dIndexY = min(ny/2, dIndexY);
+        }
        float centerPos[4];
        blockCenter.store(centerPos);
        VoxelIndex centerVoxelIndex = getVoxelIndex(centerPos);
@@ -202,7 +206,9 @@ public:
                float dz = maxDistance+blockWidth[2];
                dz = sqrtf(max(0.0f, dz*dz-dx*dx-dy*dy));
-                bool needPeriodic = (voxelIndex.x != x || voxelIndex.y != y || centerPos[2]-dz < 0.0f || centerPos[2]+dz > periodicBoxSize[2]);
+                bool needPeriodic = (centerPos[0]-blockWidth[0] < maxDistance || centerPos[0]+blockWidth[0] > periodicBoxSize[0]-maxDistance ||
+                                     centerPos[1]-blockWidth[1] < maxDistance || centerPos[1]+blockWidth[1] > periodicBoxSize[1]-maxDistance ||
+                                     centerPos[2]-dz < 0.0f || centerPos[2]+dz > periodicBoxSize[2]);
                int rangeStart[2];
                int rangeEnd[2];
                rangeStart[0] = findLowerBound(voxelIndex.x, voxelIndex.y, centerPos[2]-dz);

--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
@@ -22,7 +22,6 @@
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
-#include <string.h>
 #include <complex>
 #include "SimTKOpenMMCommon.h"
@@ -179,7 +178,7 @@ void CpuNonbondedForce::tabulateEwaldScaleFactor() {
 void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates,
                                             const vector<pair<float, float> >& atomParameters, const vector<set<int> >& exclusions,
-                                             vector<RealVec>& forces, float* totalEnergy) const {
+                                             vector<RealVec>& forces, double* totalEnergy) const {
    typedef std::complex<float> d_complex;
    static const float epsilon     =  1.0;
@@ -292,7 +291,7 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
 void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates, const vector<pair<float, float> >& atomParameters,
-                const vector<set<int> >& exclusions, float* forces, float* totalEnergy, ThreadPool& threads) {
+                const vector<set<int> >& exclusions, vector<vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
    // Record the parameters for the threads.
    this->numberOfAtoms = numberOfAtoms;
@@ -300,9 +299,9 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
    this->atomCoordinates = &atomCoordinates[0];
    this->atomParameters = &atomParameters[0];
    this->exclusions = &exclusions[0];
+    this->threadForce = &threadForce;
    includeEnergy = (totalEnergy != NULL);
    threadEnergy.resize(threads.getNumThreads());
-    threadForce.resize(threads.getNumThreads());
    // Signal the threads to start running and wait for them to finish.
@@ -310,21 +309,15 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
    threads.execute(task);
    threads.waitForThreads();
-    // Combine the results from all the threads.
+    // Combine the energies from all the threads.
+    if (totalEnergy != NULL) {
        double directEnergy = 0;
        int numThreads = threads.getNumThreads();
        for (int i = 0; i < numThreads; i++)
            directEnergy += threadEnergy[i];
-    for (int i = 0; i < numberOfAtoms; i++) {
+        *totalEnergy += directEnergy;
-        fvec4 f(forces+4*i);
-        for (int j = 0; j < numThreads; j++)
-            f += fvec4(&threadForce[j][4*i]);
-        f.store(forces+4*i);
    }
-    if (totalEnergy != NULL)
-        *totalEnergy += (float) directEnergy;
 }
 void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex) {
@@ -333,10 +326,7 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
    int numThreads = threads.getNumThreads();
    threadEnergy[threadIndex] = 0;
    double* energyPtr = (includeEnergy ? &threadEnergy[threadIndex] : NULL);
-    threadForce[threadIndex].resize(4*numberOfAtoms, 0.0f);
+    float* forces = &(*threadForce)[threadIndex][0];
-    float* forces = &threadForce[threadIndex][0];
-    for (int i = 0; i < 4*numberOfAtoms; i++)
-        forces[i] = 0.0f;
    fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
    fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
    if (ewald || pme) {

--- a/platforms/cpu/src/CpuPlatform.cpp
+++ b/platforms/cpu/src/CpuPlatform.cpp
@@ -35,6 +35,7 @@
 #include "openmm/internal/hardware.h"
 using namespace OpenMM;
+using namespace std;
 extern "C" OPENMM_EXPORT_CPU void registerPlatforms() {
    // Only register this platform if the CPU supports SSE 4.1.
@@ -43,9 +44,13 @@ extern "C" OPENMM_EXPORT_CPU void registerPlatforms() {
        Platform::registerPlatform(new CpuPlatform());
 }
+map<ContextImpl*, CpuPlatform::PlatformData*> CpuPlatform::contextData;
 CpuPlatform::CpuPlatform() {
    CpuKernelFactory* factory = new CpuKernelFactory();
+    registerKernelFactory(CalcForcesAndEnergyKernel::Name(), factory);
    registerKernelFactory(CalcNonbondedForceKernel::Name(), factory);
+    registerKernelFactory(CalcGBSAOBCForceKernel::Name(), factory);
 }
 double CpuPlatform::getSpeed() const {
@@ -67,3 +72,28 @@ bool CpuPlatform::isProcessorSupported() {
    }
    return false;
 }
+void CpuPlatform::contextCreated(ContextImpl& context, const map<string, string>& properties) const {
+    ReferencePlatform::contextCreated(context, properties);
+    PlatformData* data = new PlatformData(context.getSystem().getNumParticles());
+    contextData[&context] = data;
+}
+void CpuPlatform::contextDestroyed(ContextImpl& context) const {
+    PlatformData* data = contextData[&context];
+    delete data;
+    contextData.erase(&context);
+}
+CpuPlatform::PlatformData& CpuPlatform::getPlatformData(ContextImpl& context) {
+    return *contextData[&context];
+}
+CpuPlatform::PlatformData::PlatformData(int numParticles) {
+    posq.resize(4*numParticles);
+    int numThreads = threads.getNumThreads();
+    threadForce.resize(numThreads);
+    for (int i = 0; i < numThreads; i++)
+        threadForce[i].resize(4*numParticles);
+    isPeriodic = false;
+}
--- a/platforms/cpu/tests/CMakeLists.txt
+++ b/platforms/cpu/tests/CMakeLists.txt
@@ -11,6 +11,7 @@ IF( INCLUDE_SERIALIZATION )
    INCLUDE_DIRECTORIES(${OPENMM_DIR}/serialization/include)
    SET( SHARED_OPENMM_SERIALIZATION "OpenMMSerialization" )
 ENDIF( INCLUDE_SERIALIZATION )
+GET_PROPERTY(COMPILE_FLAGS GLOBAL PROPERTY COMPILE_FLAGS)
 # Automatically create tests using files named "Test*.cpp"
 FILE(GLOB TEST_PROGS "*Test*.cpp")
@@ -20,6 +21,7 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    # Link with shared library
    ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
    TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_TARGET})
+    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES COMPILE_FLAGS "${COMPILE_FLAGS}" LINK_FLAGS "${COMPILE_FLAGS}")
    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} single)
 ENDFOREACH(TEST_PROG ${TEST_PROGS})
--- a/platforms/cpu/tests/TestCpuGBSAOBCForce.cpp
+++ b/platforms/cpu/tests/TestCpuGBSAOBCForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2013 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+/**
+ * This tests the CPU implementation of GBSAOBCForce.
+ */
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "CpuPlatform.h"
+#include "openmm/GBSAOBCForce.h"
+#include "openmm/System.h"
+#include "openmm/LangevinIntegrator.h"
+#include "openmm/NonbondedForce.h"
+#include "SimTKOpenMMRealType.h"
+#include "sfmt/SFMT.h"
+#include <iostream>
+#include <vector>
+using namespace OpenMM;
+using namespace std;
+const double TOL = 1e-5;
+void testSingleParticle() {
+    CpuPlatform platform;
+    System system;
+    system.addParticle(2.0);
+    LangevinIntegrator integrator(0, 0.1, 0.01);
+    GBSAOBCForce* forceField = new GBSAOBCForce();
+    forceField->addParticle(0.5, 0.15, 1);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(1);
+    positions[0] = Vec3(0, 0, 0);
+    context.setPositions(positions);
+    State state = context.getState(State::Energy);
+    double bornRadius = 0.15-0.009; // dielectric offset
+    double eps0 = EPSILON0;
+    double bornEnergy = (-0.5*0.5/(8*PI_M*eps0))*(1.0/forceField->getSoluteDielectric()-1.0/forceField->getSolventDielectric())/bornRadius;
+    double extendedRadius = 0.15+0.14; // probe radius
+    double nonpolarEnergy = CAL2JOULE*PI_M*0.0216*(10*extendedRadius)*(10*extendedRadius)*std::pow(0.15/bornRadius, 6.0); // Where did this formula come from?  Just copied it from CpuImplicitSolvent.cpp
+    ASSERT_EQUAL_TOL((bornEnergy+nonpolarEnergy), state.getPotentialEnergy(), 0.01);
+    // Change the parameters and see if it is still correct.
+    forceField->setParticleParameters(0, 0.4, 0.25, 1);
+    forceField->updateParametersInContext(context);
+    state = context.getState(State::Energy);
+    bornRadius = 0.25-0.009; // dielectric offset
+    bornEnergy = (-0.4*0.4/(8*PI_M*eps0))*(1.0/forceField->getSoluteDielectric()-1.0/forceField->getSolventDielectric())/bornRadius;
+    extendedRadius = bornRadius+0.14;
+    nonpolarEnergy = CAL2JOULE*PI_M*0.0216*(10*extendedRadius)*(10*extendedRadius)*std::pow(0.25/bornRadius, 6.0);
+    ASSERT_EQUAL_TOL((bornEnergy+nonpolarEnergy), state.getPotentialEnergy(), 0.01);
+}
+void testCutoffAndPeriodic() {
+    CpuPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    LangevinIntegrator integrator(0, 0.1, 0.01);
+    GBSAOBCForce* gbsa = new GBSAOBCForce();
+    NonbondedForce* nonbonded = new NonbondedForce();
+    gbsa->addParticle(-1, 0.15, 1);
+    nonbonded->addParticle(-1, 1, 0);
+    gbsa->addParticle(1, 0.15, 1);
+    nonbonded->addParticle(1, 1, 0);
+    const double cutoffDistance = 3.0;
+    const double boxSize = 10.0;
+    nonbonded->setCutoffDistance(cutoffDistance);
+    gbsa->setCutoffDistance(cutoffDistance);
+    system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    system.addForce(gbsa);
+    system.addForce(nonbonded);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(2, 0, 0);
+    // Calculate the forces for both cutoff and periodic with two different atom positions.
+    nonbonded->setNonbondedMethod(NonbondedForce::CutoffNonPeriodic);
+    gbsa->setNonbondedMethod(GBSAOBCForce::CutoffNonPeriodic);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    State state1 = context.getState(State::Forces);
+    nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
+    gbsa->setNonbondedMethod(GBSAOBCForce::CutoffPeriodic);
+    context.reinitialize();
+    context.setPositions(positions);
+    State state2 = context.getState(State::Forces);
+    positions[1][0]+= boxSize;
+    nonbonded->setNonbondedMethod(NonbondedForce::CutoffNonPeriodic);
+    gbsa->setNonbondedMethod(GBSAOBCForce::CutoffNonPeriodic);
+    context.reinitialize();
+    context.setPositions(positions);
+    State state3 = context.getState(State::Forces);
+    nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
+    gbsa->setNonbondedMethod(GBSAOBCForce::CutoffPeriodic);
+    context.reinitialize();
+    context.setPositions(positions);
+    State state4 = context.getState(State::Forces);
+    // All forces should be identical, exception state3 which should be zero.
+    ASSERT_EQUAL_VEC(state1.getForces()[0], state2.getForces()[0], 0.01);
+    ASSERT_EQUAL_VEC(state1.getForces()[1], state2.getForces()[1], 0.01);
+    ASSERT_EQUAL_VEC(state1.getForces()[0], state4.getForces()[0], 0.01);
+    ASSERT_EQUAL_VEC(state1.getForces()[1], state4.getForces()[1], 0.01);
+    ASSERT_EQUAL_VEC(state3.getForces()[0], Vec3(0, 0, 0), 0.01);
+    ASSERT_EQUAL_VEC(state3.getForces()[1], Vec3(0, 0, 0), 0.01);
+}
+void testForce(int numParticles, NonbondedForce::NonbondedMethod method, GBSAOBCForce::NonbondedMethod method2) {
+    CpuPlatform platform;
+    ReferencePlatform reference;
+    System system;
+    GBSAOBCForce* gbsa = new GBSAOBCForce();
+    NonbondedForce* nonbonded = new NonbondedForce();
+    for (int i = 0; i < numParticles; ++i) {
+        system.addParticle(1.0);
+        double charge = i%2 == 0 ? -1 : 1;
+        gbsa->addParticle(charge, 0.15, 1);
+        nonbonded->addParticle(charge, 1, 0);
+    }
+    nonbonded->setNonbondedMethod(method);
+    gbsa->setNonbondedMethod(method2);
+    nonbonded->setCutoffDistance(3.0);
+    gbsa->setCutoffDistance(3.0);
+    int grid = (int) floor(0.5+pow(numParticles, 1.0/3.0));
+    if (method == NonbondedForce::CutoffPeriodic) {
+        double boxSize = (grid+1)*1.1;
+        system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    }
+    system.addForce(gbsa);
+    system.addForce(nonbonded);
+    LangevinIntegrator integrator1(0, 0.1, 0.01);
+    LangevinIntegrator integrator2(0, 0.1, 0.01);
+    Context context(system, integrator1, platform);
+    Context refContext(system, integrator2, reference);
+    // Set random (but uniformly distributed) positions for all the particles.
+    vector<Vec3> positions(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    for (int i = 0; i < grid; i++)
+        for (int j = 0; j < grid; j++)
+            for (int k = 0; k < grid; k++)
+                positions[i*grid*grid+j*grid+k] = Vec3(i*1.1, j*1.1, k*1.1);
+    for (int i = 0; i < numParticles; ++i)
+        positions[i] = positions[i] + Vec3(0.5*genrand_real2(sfmt), 0.5*genrand_real2(sfmt), 0.5*genrand_real2(sfmt));
+    context.setPositions(positions);
+    refContext.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    State refState = refContext.getState(State::Forces | State::Energy);
+    // Make sure the CPU and Reference platforms agree.
+    double norm = 0.0;
+    double diff = 0.0;
+    for (int i = 0; i < numParticles; ++i) {
+        Vec3 f = state.getForces()[i];
+        norm += f[0]*f[0] + f[1]*f[1] + f[2]*f[2];
+        Vec3 delta = f-refState.getForces()[i];
+        diff += delta[0]*delta[0] + delta[1]*delta[1] + delta[2]*delta[2];
+    }
+    norm = std::sqrt(norm);
+    diff = std::sqrt(diff);
+    ASSERT_EQUAL_TOL(0.0, diff, 0.001*norm);
+    ASSERT_EQUAL_TOL(state.getPotentialEnergy(), refState.getPotentialEnergy(), 1e-3);
+    // Take a small step in the direction of the energy gradient and see whether the potential energy changes by the expected amount.
+    // (This doesn't work with cutoffs, since the energy changes discontinuously at the cutoff distance.)
+    if (method == NonbondedForce::NoCutoff)
+    {
+        const double delta = 0.3;
+        double step = 0.5*delta/norm;
+        vector<Vec3> positions2(numParticles), positions3(numParticles);
+        for (int i = 0; i < numParticles; ++i) {
+            Vec3 p = positions[i];
+            Vec3 f = state.getForces()[i];
+            positions2[i] = Vec3(p[0]-f[0]*step, p[1]-f[1]*step, p[2]-f[2]*step);
+            positions3[i] = Vec3(p[0]+f[0]*step, p[1]+f[1]*step, p[2]+f[2]*step);
+        }
+        context.setPositions(positions2);
+        State state2 = context.getState(State::Energy);
+        context.setPositions(positions3);
+        State state3 = context.getState(State::Energy);
+        ASSERT_EQUAL_TOL(norm, (state2.getPotentialEnergy()-state3.getPotentialEnergy())/delta, 1e-2)
+    }
+}
+int main() {
+    try {
+        testSingleParticle();
+        testCutoffAndPeriodic();
+        for (int i = 5; i < 11; i++) {
+            testForce(i*i*i, NonbondedForce::NoCutoff, GBSAOBCForce::NoCutoff);
+            testForce(i*i*i, NonbondedForce::CutoffNonPeriodic, GBSAOBCForce::CutoffNonPeriodic);
+            testForce(i*i*i, NonbondedForce::CutoffPeriodic, GBSAOBCForce::CutoffPeriodic);
+        }
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}