Merge pull request #1547 from peastman/paramderivs

Energy derivatives with respect to global parameters

Merge pull request #1547 from peastman/paramderivs
Energy derivatives with respect to global parameters
1f7866ad · peastman · GitHub · 37787af9 · 7851bad8 · 1f7866ad
Commit 1f7866ad, authored Jul 29, 2016 by peastman; committed by GitHub Jul 29, 2016
20 changed files
--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -76,7 +76,7 @@ bool CudaContext::hasInitializedCuda = false;
 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
        const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData) : system(system), currentStream(0),
        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasCompilerKernel(false),
-        pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), atomIndexDevice(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
+        pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
    this->compiler = "\""+compiler+"\"";
    if (platformData.context != NULL) {
        try {
@@ -339,6 +339,8 @@ CudaContext::~CudaContext() {
        delete force;
    if (energyBuffer != NULL)
        delete energyBuffer;
+    if (energyParamDerivBuffer != NULL)
+        delete energyParamDerivBuffer;
    if (atomIndexDevice != NULL)
        delete atomIndexDevice;
    if (integration != NULL)
@@ -390,6 +392,14 @@ void CudaContext::initialize() {
    force = CudaArray::create<long long>(*this, paddedNumAtoms*3, "force");
    addAutoclearBuffer(force->getDevicePointer(), force->getSize()*force->getElementSize());
    addAutoclearBuffer(energyBuffer->getDevicePointer(), energyBuffer->getSize()*energyBuffer->getElementSize());
+    int numEnergyParamDerivs = energyParamDerivNames.size();
+    if (numEnergyParamDerivs > 0) {
+        if (useDoublePrecision || useMixedPrecision)
+            energyParamDerivBuffer = CudaArray::create<double>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer");
+        else
+            energyParamDerivBuffer = CudaArray::create<float>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer");
+        addAutoclearBuffer(*energyParamDerivBuffer);
+    }
    atomIndexDevice = CudaArray::create<int>(*this, paddedNumAtoms, "atomIndex");
    atomIndex.resize(paddedNumAtoms);
    for (int i = 0; i < paddedNumAtoms; ++i)
@@ -1311,6 +1321,15 @@ void CudaContext::addPostComputation(ForcePostComputation* computation) {
    postComputations.push_back(computation);
 }

+void CudaContext::addEnergyParameterDerivative(const string& param) {
+    // See if this parameter has already been registered.
+    
+    for (int i = 0; i < energyParamDerivNames.size(); i++)
+        if (param == energyParamDerivNames[i])
+            return;
+    energyParamDerivNames.push_back(param);
+}
+
 struct CudaContext::WorkThread::ThreadData {
    ThreadData(std::queue<CudaContext::WorkTask*>& tasks, bool& waiting,  bool& finished,
            pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) :

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2016 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -146,6 +146,19 @@ void CudaNonbondedUtilities::addArgument(const ParameterInfo& parameter) {
    arguments.push_back(parameter);
 }

+string CudaNonbondedUtilities::addEnergyParameterDerivative(const string& param) {
+    // See if the parameter has already been added.
+    
+    int index;
+    for (index = 0; index < energyParameterDerivatives.size(); index++)
+        if (param == energyParameterDerivatives[index])
+            break;
+    if (index == energyParameterDerivatives.size())
+        energyParameterDerivatives.push_back(param);
+    context.addEnergyParameterDerivative(param);
+    return string("energyParamDeriv")+context.intToString(index);
+}
+
 void CudaNonbondedUtilities::requestExclusions(const vector<vector<int> >& exclusionList) {
    if (anyExclusions) {
        bool sameExclusions = (exclusionList.size() == atomExclusions.size());
@@ -308,6 +321,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
        forceArgs.push_back(&parameters[i].getMemory());
    for (int i = 0; i < (int) arguments.size(); i++)
        forceArgs.push_back(&arguments[i].getMemory());
+    if (energyParameterDerivatives.size() > 0)
+        forceArgs.push_back(&context.getEnergyParamDerivBuffer().getDevicePointer());
    if (useCutoff) {
        findBlockBoundsArgs.push_back(&numAtoms);
        findBlockBoundsArgs.push_back(context.getPeriodicBoxSizePointer());
@@ -515,6 +530,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
        args << "* __restrict__ ";
        args << arguments[i].getName();
    }
+    if (energyParameterDerivatives.size() > 0)
+        args << ", mixed* __restrict__ energyParamDerivs";
    replacements["PARAMETER_ARGUMENTS"] = args.str();

    stringstream load1;
@@ -623,6 +640,18 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
        }
    }
    replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
+    stringstream initDerivs;
+    for (int i = 0; i < energyParameterDerivatives.size(); i++)
+        initDerivs<<"mixed energyParamDeriv"<<i<<" = 0;\n";
+    replacements["INIT_DERIVATIVES"] = initDerivs.str();
+    stringstream saveDerivs;
+    const vector<string>& allParamDerivNames = context.getEnergyParamDerivNames();
+    int numDerivs = allParamDerivNames.size();
+    for (int i = 0; i < energyParameterDerivatives.size(); i++)
+        for (int index = 0; index < numDerivs; index++)
+            if (allParamDerivNames[index] == energyParameterDerivatives[i])
+                saveDerivs<<"energyParamDerivs[(blockIdx.x*blockDim.x+threadIdx.x)*"<<numDerivs<<"+"<<index<<"] += energyParamDeriv"<<i<<";\n";
+    replacements["SAVE_DERIVATIVES"] = saveDerivs.str();

    stringstream shuffleWarpData;
    if(useShuffle) {

--- a/platforms/cuda/src/kernels/customCentroidBond.cu
+++ b/platforms/cuda/src/kernels/customCentroidBond.cu
@@ -111,10 +111,12 @@ extern "C" __global__ void computeGroupForces(unsigned long long* __restrict__ g
        const int* __restrict__ bondGroups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        EXTRA_ARGS) {
    mixed energy = 0;
+    INIT_PARAM_DERIVS
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_BONDS; index += blockDim.x*gridDim.x) {
        COMPUTE_FORCE
    }
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+    SAVE_PARAM_DERIVS
 }

 /**

--- a/platforms/cuda/src/kernels/customGBEnergyN2.cu
+++ b/platforms/cuda/src/kernels/customGBEnergyN2.cu
@@ -28,6 +28,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
    const unsigned int tbx = threadIdx.x - tgx;
    mixed energy = 0;
+    INIT_PARAM_DERIVS
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];

    // First loop: process tiles that contain exclusions.
@@ -69,6 +70,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
                    atom2 = y*TILE_SIZE+j;
                    real dEdR = 0;
                    real tempEnergy = 0;
+                    const real interactionScale = 0.5f;
 #ifdef USE_EXCLUSIONS
                    bool isExcluded = !(excl & 0x1);
 #endif
@@ -120,6 +122,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
                    atom2 = y*TILE_SIZE+tj;
                    real dEdR = 0;
                    real tempEnergy = 0;
+                    const real interactionScale = 1;
 #ifdef USE_EXCLUSIONS
                    bool isExcluded = !(excl & 0x1);
 #endif
@@ -266,6 +269,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
                        atom2 = atomIndices[tbx+tj];
                        real dEdR = 0;
                        real tempEnergy = 0;
+                        const real interactionScale = 1;
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                            COMPUTE_INTERACTION
                            dEdR /= -r;
@@ -309,6 +313,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
                        atom2 = atomIndices[tbx+tj];
                        real dEdR = 0;
                        real tempEnergy = 0;
+                        const real interactionScale = 1;
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                            COMPUTE_INTERACTION
                            dEdR /= -r;
@@ -353,4 +358,5 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
        pos++;
    }
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+    SAVE_PARAM_DERIVS
 }
--- a/platforms/cuda/src/kernels/customGBEnergyPerParticle.cu
+++ b/platforms/cuda/src/kernels/customGBEnergyPerParticle.cu
@@ -5,6 +5,7 @@
 extern "C" __global__ void computePerParticleEnergy(long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq
        PARAMETER_ARGUMENTS) {
    mixed energy = 0;
+    INIT_PARAM_DERIVS
    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
        // Load the derivatives

@@ -17,4 +18,5 @@ extern "C" __global__ void computePerParticleEnergy(long long* __restrict__ forc
        COMPUTE_ENERGY
    }
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+    SAVE_PARAM_DERIVS
 }
--- a/platforms/cuda/src/kernels/customGBGradientChainRule.cu
+++ b/platforms/cuda/src/kernels/customGBGradientChainRule.cu
@@ -4,6 +4,7 @@

 extern "C" __global__ void computeGradientChainRuleTerms(long long* __restrict__ forceBuffers, const real4* __restrict__ posq
        PARAMETER_ARGUMENTS) {
+    INIT_PARAM_DERIVS
    const real scale = RECIP((real) 0x100000000);
    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
        real4 pos = posq[index];
@@ -13,4 +14,5 @@ extern "C" __global__ void computeGradientChainRuleTerms(long long* __restrict__
        forceBuffers[index+PADDED_NUM_ATOMS] = (long long) (force.y*0x100000000);
        forceBuffers[index+PADDED_NUM_ATOMS*2] = (long long) (force.z*0x100000000);
    }
+    SAVE_PARAM_DERIVS
 }
--- a/platforms/cuda/src/kernels/customGBValueN2.cu
+++ b/platforms/cuda/src/kernels/customGBValueN2.cu
@@ -73,6 +73,7 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
                        COMPUTE_VALUE
                    }
                    value += tempValue1;
+                    ADD_TEMP_DERIVS1
 #ifdef USE_CUTOFF
                }
 #endif
@@ -121,6 +122,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
                    }
                    value += tempValue1;
                    localData[tbx+tj].value += tempValue2;
+                    ADD_TEMP_DERIVS1
+                    ADD_TEMP_DERIVS2
 #ifdef USE_CUTOFF
                }
 #endif
@@ -133,11 +136,13 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const

        // Write results.

-        unsigned int offset = x*TILE_SIZE + tgx;
-        atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (value*0x100000000)));
+        unsigned int offset1 = x*TILE_SIZE + tgx;
+        atomicAdd(&global_value[offset1], static_cast<unsigned long long>((long long) (value*0x100000000)));
+        STORE_PARAM_DERIVS1
        if (x != y) {
-            offset = y*TILE_SIZE + tgx;
-            atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
+            unsigned int offset2 = y*TILE_SIZE + tgx;
+            atomicAdd(&global_value[offset2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
+            STORE_PARAM_DERIVS2
        }
    }

@@ -244,6 +249,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
                        }
                        value += tempValue1;
                        localData[tbx+tj].value += tempValue2;
+                        ADD_TEMP_DERIVS1
+                        ADD_TEMP_DERIVS2
                    }
                    tj = (tj + 1) & (TILE_SIZE - 1);
                }
@@ -276,6 +283,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
                        }
                        value += tempValue1;
                        localData[tbx+tj].value += tempValue2;
+                        ADD_TEMP_DERIVS1
+                        ADD_TEMP_DERIVS2
 #ifdef USE_CUTOFF
                    }
 #endif
@@ -285,14 +294,19 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
        
            // Write results.

-            atomicAdd(&global_value[atom1], static_cast<unsigned long long>((long long) (value*0x100000000)));
+            unsigned int offset1 = atom1;
+            atomicAdd(&global_value[offset1], static_cast<unsigned long long>((long long) (value*0x100000000)));
+            STORE_PARAM_DERIVS1
 #ifdef USE_CUTOFF
            unsigned int atom2 = atomIndices[threadIdx.x];
 #else
            unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
-            if (atom2 < PADDED_NUM_ATOMS)
-                atomicAdd(&global_value[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
+            if (atom2 < PADDED_NUM_ATOMS) {
+                unsigned int offset2 = atom2;
+                atomicAdd(&global_value[offset2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
+                STORE_PARAM_DERIVS2
+            }
        }
        pos++;
    }

--- a/platforms/cuda/src/kernels/customGBValuePerParticle.cu
+++ b/platforms/cuda/src/kernels/customGBValuePerParticle.cu
@@ -8,6 +8,7 @@ extern "C" __global__ void computePerParticleValues(real4* posq, long long* valu
        // Load the pairwise value

        real sum = valueBuffers[index]/(real) 0x100000000;
+        REDUCE_PARAM0_DERIV
        
        // Now calculate other values


--- a/platforms/cuda/src/kernels/customIntegratorPerDof.cu
+++ b/platforms/cuda/src/kernels/customIntegratorPerDof.cu
@@ -33,7 +33,8 @@ inline __device__ mixed4 convertFromDouble4(double4 a) {

 extern "C" __global__ void computePerDof(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta,
        mixed4* __restrict__ velm, const long long* __restrict__ force, const mixed2* __restrict__ dt, const mixed* __restrict__ globals,
-        mixed* __restrict__ sum, const float4* __restrict__ gaussianValues, unsigned int gaussianBaseIndex, const float4* __restrict__ uniformValues, const real energy
+        mixed* __restrict__ sum, const float4* __restrict__ gaussianValues, unsigned int gaussianBaseIndex, const float4* __restrict__ uniformValues,
+        const real energy, mixed* __restrict__ energyParamDerivs
        PARAMETER_ARGUMENTS) {
    mixed stepSize = dt[0].y;
    int index = blockIdx.x*blockDim.x+threadIdx.x;

--- a/platforms/cuda/src/kernels/customNonbonded.cu
+++ b/platforms/cuda/src/kernels/customNonbonded.cu
@@ -4,15 +4,18 @@ if (!isExcluded && r2 < CUTOFF_SQUARED) {
 if (!isExcluded) {
 #endif
    real tempForce = 0;
-    COMPUTE_FORCE
+    real switchValue = 1, switchDeriv = 0;
 #if USE_SWITCH
    if (r > SWITCH_CUTOFF) {
        real x = r-SWITCH_CUTOFF;
-        real switchValue = 1+x*x*x*(SWITCH_C3+x*(SWITCH_C4+x*SWITCH_C5));
-        real switchDeriv = x*x*(3*SWITCH_C3+x*(4*SWITCH_C4+x*5*SWITCH_C5));
-        tempForce = tempForce*switchValue - tempEnergy*switchDeriv;
-        tempEnergy *= switchValue;
+        switchValue = 1+x*x*x*(SWITCH_C3+x*(SWITCH_C4+x*SWITCH_C5));
+        switchDeriv = x*x*(3*SWITCH_C3+x*(4*SWITCH_C4+x*5*SWITCH_C5));
    }
+#endif
+    COMPUTE_FORCE
+#if USE_SWITCH
+    tempForce = tempForce*switchValue - tempEnergy*switchDeriv;
+    tempEnergy *= switchValue;
 #endif
    dEdR += tempForce*invR;
 }
--- a/platforms/cuda/src/kernels/nonbonded.cu
+++ b/platforms/cuda/src/kernels/nonbonded.cu
@@ -113,6 +113,7 @@ extern "C" __global__ void computeNonbonded(
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
    const unsigned int tbx = threadIdx.x - tgx;           // block warpIndex
    mixed energy = 0;
+    INIT_DERIVATIVES
    // used shared memory if the device cannot shuffle
 #ifndef ENABLE_SHUFFLE
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
@@ -175,6 +176,7 @@ extern "C" __global__ void computeNonbonded(
                bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
 #endif
                real tempEnergy = 0.0f;
+                const real interactionScale = 0.5f;
                COMPUTE_INTERACTION
                energy += 0.5f*tempEnergy;
 #ifdef INCLUDE_FORCES
@@ -243,6 +245,7 @@ extern "C" __global__ void computeNonbonded(
                bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
 #endif
                real tempEnergy = 0.0f;
+                const real interactionScale = 1.0f;
                COMPUTE_INTERACTION
                energy += tempEnergy;
 #ifdef INCLUDE_FORCES
@@ -448,6 +451,7 @@ extern "C" __global__ void computeNonbonded(
                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
 #endif
                    real tempEnergy = 0.0f;
+                    const real interactionScale = 1.0f;
                    COMPUTE_INTERACTION
                    energy += tempEnergy;
 #ifdef INCLUDE_FORCES
@@ -518,6 +522,7 @@ extern "C" __global__ void computeNonbonded(
                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
 #endif
                    real tempEnergy = 0.0f;
+                    const real interactionScale = 1.0f;
                    COMPUTE_INTERACTION
                    energy += tempEnergy;
 #ifdef INCLUDE_FORCES
@@ -586,4 +591,5 @@ extern "C" __global__ void computeNonbonded(
 #ifdef INCLUDE_ENERGY
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
 #endif
+    SAVE_DERIVATIVES
 }
\ No newline at end of file
--- a/platforms/opencl/include/OpenCLBondedUtilities.h
+++ b/platforms/opencl/include/OpenCLBondedUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2016 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -100,6 +100,15 @@ public:
     * refer to it by this name.
     */
    std::string addArgument(cl::Memory& data, const std::string& type);
+    /**
+     * Register that the interaction kernel will be computing the derivative of the potential energy
+     * with respect to a parameter.
+     * 
+     * @param param   the name of the parameter
+     * @return the variable that will be used to accumulate the derivative.  Any code you pass to addInteraction() should
+     * add its contributions to this variable.
+     */
+    std::string addEnergyParameterDerivative(const std::string& param);
    /**
     * Add some OpenCL code that should be included in the program, before the start of the kernel.
     * This can be used, for example, to define functions that will be called by the kernel.
@@ -137,6 +146,7 @@ private:
    std::vector<OpenCLArray*> atomIndices;
    std::vector<OpenCLArray*> bufferIndices;
    std::vector<std::string> prefixCode;
+    std::vector<std::string> energyParameterDerivatives;
    int numForceBuffers, maxBonds, allGroups;
    bool hasInitializedKernels;
 };

--- a/platforms/opencl/include/OpenCLContext.h
+++ b/platforms/opencl/include/OpenCLContext.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2016 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -264,6 +264,12 @@ public:
    OpenCLArray& getEnergyBuffer() {
        return *energyBuffer;
    }
+    /**
+     * Get the array which contains the buffer in which derivatives of the energy with respect to parameters are computed.
+     */
+    OpenCLArray& getEnergyParamDerivBuffer() {
+        return *energyParamDerivBuffer;
+    }
    /**
     * Get a pointer to a block of pinned memory that can be used for efficient transfers between host and device.
     * This is guaranteed to be at least as large as any of the arrays returned by methods of this class.
@@ -659,6 +665,27 @@ public:
    std::vector<ForcePostComputation*>& getPostComputations() {
        return postComputations;
    }
+    /**
+     * Get the names of all parameters with respect to which energy derivatives are computed.
+     */
+    const std::vector<std::string>& getEnergyParamDerivNames() const {
+        return energyParamDerivNames;
+    }
+    /**
+     * Get a workspace data structure used for accumulating the values of derivatives of the energy
+     * with respect to parameters.
+     */
+    std::map<std::string, double>& getEnergyParamDerivWorkspace() {
+        return energyParamDerivWorkspace;
+    }
+    /**
+     * Register that the derivative of potential energy with respect to a context parameter
+     * will need to be calculated.  If this is called multiple times for a single parameter,
+     * it is only added to the list once.
+     * 
+     * @param param    the name of the parameter to add
+     */
+    void addEnergyParameterDerivative(const std::string& param);
    /**
     * Mark that the current molecule definitions (and hence the atom order) may be invalid.
     * This should be called whenever force field parameters change.  It will cause the definitions
@@ -725,7 +752,10 @@ private:
    OpenCLArray* forceBuffers;
    OpenCLArray* longForceBuffer;
    OpenCLArray* energyBuffer;
+    OpenCLArray* energyParamDerivBuffer;
    OpenCLArray* atomIndexDevice;
+    std::vector<std::string> energyParamDerivNames;
+    std::map<std::string, double> energyParamDerivWorkspace;
    std::vector<int> atomIndex;
    std::vector<cl::Memory*> autoclearBuffers;
    std::vector<int> autoclearBufferSizes;

--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -141,6 +141,12 @@ public:
     * @param forces  on exit, this contains the forces
     */
    void getForces(ContextImpl& context, std::vector<Vec3>& forces);
+    /**
+     * Get the current derivatives of the energy with respect to context parameters.
+     *
+     * @param derivs  on exit, this contains the derivatives
+     */
+    void getEnergyParameterDerivatives(ContextImpl& context, std::map<std::string, double>& derivs);
    /**
     * Get the current periodic box vectors.
     *
@@ -709,6 +715,7 @@ private:
    std::vector<cl_float> globalParamValues;
    std::vector<OpenCLArray*> tabulatedFunctions;
    double longRangeCoefficient;
+    std::vector<double> longRangeCoefficientDerivs;
    bool hasInitializedLongRangeCorrection, hasInitializedKernel;
    int numGroupThreadBlocks;
    CustomNonbondedForce* forceCopy;
@@ -801,13 +808,15 @@ public:
    void copyParametersToContext(ContextImpl& context, const CustomGBForce& force);
 private:
    double cutoff;
-    bool hasInitializedKernels, needParameterGradient;
+    bool hasInitializedKernels, needParameterGradient, needEnergyParamDerivs;
    int maxTiles, numComputedValues;
    OpenCLContext& cl;
    OpenCLParameterSet* params;
    OpenCLParameterSet* computedValues;
    OpenCLParameterSet* energyDerivs;
    OpenCLParameterSet* energyDerivChain;
+    std::vector<OpenCLParameterSet*> dValuedParam;
+    std::vector<OpenCLArray*> dValue0dParam;
    OpenCLArray* longEnergyDerivs;
    OpenCLArray* globals;
    OpenCLArray* valueBuffers;
@@ -953,6 +962,7 @@ public:

 private:
    int numGroups, numBonds;
+    bool needEnergyParamDerivs;
    OpenCLContext& cl;
    OpenCLParameterSet* params;
    OpenCLArray* globals;
@@ -1273,7 +1283,7 @@ public:
    enum GlobalTargetType {DT, VARIABLE, PARAMETER};
    OpenCLIntegrateCustomStepKernel(std::string name, const Platform& platform, OpenCLContext& cl) : IntegrateCustomStepKernel(name, platform), cl(cl),
            hasInitializedKernels(false), localValuesAreCurrent(false), globalValues(NULL), sumBuffer(NULL), summedValue(NULL), uniformRandoms(NULL),
-            randomSeed(NULL), perDofValues(NULL) {
+            randomSeed(NULL), perDofEnergyParamDerivs(NULL), perDofValues(NULL), needsEnergyParamDerivs(false) {
    }
    ~OpenCLIntegrateCustomStepKernel();
    /**
@@ -1338,8 +1348,11 @@ public:
 private:
    class ReorderListener;
    class GlobalTarget;
+    class DerivFunction;
    std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName);
    void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
+    Lepton::ExpressionTreeNode replaceDerivFunctions(const Lepton::ExpressionTreeNode& node, OpenMM::ContextImpl& context);
+    void findExpressionsForDerivs(const Lepton::ExpressionTreeNode& node, std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variableNodes);
    void recordGlobalValue(double value, GlobalTarget target);
    void recordChangedParameters(ContextImpl& context);
    bool evaluateCondition(int step);
@@ -1347,18 +1360,23 @@ private:
    double energy;
    float energyFloat;
    int numGlobalVariables;
-    bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints;
+    bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs;
    mutable bool localValuesAreCurrent;
    OpenCLArray* globalValues;
    OpenCLArray* sumBuffer;
    OpenCLArray* summedValue;
    OpenCLArray* uniformRandoms;
    OpenCLArray* randomSeed;
+    OpenCLArray* perDofEnergyParamDerivs;
    std::map<int, OpenCLArray*> savedForces;
    std::set<int> validSavedForces;
    OpenCLParameterSet* perDofValues;
    mutable std::vector<std::vector<cl_float> > localPerDofValuesFloat;
    mutable std::vector<std::vector<cl_double> > localPerDofValuesDouble;
+    std::map<std::string, double> energyParamDerivs;
+    std::vector<std::string> perDofEnergyParamDerivNames;
+    std::vector<cl_float> localPerDofEnergyParamDerivsFloat;
+    std::vector<cl_double> localPerDofEnergyParamDerivsDouble;
    std::vector<float> globalValuesFloat;
    std::vector<double> globalValuesDouble;
    std::vector<double> initialGlobalVariables;

--- a/platforms/opencl/include/OpenCLNonbondedUtilities.h
+++ b/platforms/opencl/include/OpenCLNonbondedUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2013 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2016 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -88,6 +88,15 @@ public:
     * Add an array (other than a per-atom parameter) that should be passed as an argument to the default interaction kernel.
     */
    void addArgument(const ParameterInfo& parameter);
+    /**
+     * Register that the interaction kernel will be computing the derivative of the potential energy
+     * with respect to a parameter.
+     * 
+     * @param param   the name of the parameter
+     * @return the variable that will be used to accumulate the derivative.  Any code you pass to addInteraction() should
+     * add its contributions to this variable.
+     */
+    std::string addEnergyParameterDerivative(const std::string& param);
    /**
     * Specify the list of exclusions that an interaction outside the default kernel will depend on.
     * 
@@ -287,6 +296,7 @@ private:
    std::vector<std::vector<int> > atomExclusions;
    std::vector<ParameterInfo> parameters;
    std::vector<ParameterInfo> arguments;
+    std::vector<std::string> energyParameterDerivatives;
    std::map<int, double> groupCutoff;
    std::map<int, std::string> groupKernelSource;
    double lastCutoff;

--- a/platforms/opencl/src/OpenCLBondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLBondedUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2016 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -56,12 +56,25 @@ void OpenCLBondedUtilities::addInteraction(const vector<vector<int> >& atoms, co
    }
 }

-std::string OpenCLBondedUtilities::addArgument(cl::Memory& data, const string& type) {
+// Records an extra argument for the bonded kernel: the address of the memory object and the type
+// string that initialize() uses to declare it ("__global <type>* customArgN").  The returned name
+// is numbered from 1 (the argument count after insertion), matching the customArg(i+1) names that
+// initialize() emits.  Only the address of data is stored, so the object presumably has to stay
+// valid for as long as this class uses it.
+string OpenCLBondedUtilities::addArgument(cl::Memory& data, const string& type) {
    arguments.push_back(&data);
    argTypes.push_back(type);
    return "customArg"+context.intToString(arguments.size());
 }

+// Registers param as a parameter whose energy derivative the bonded kernel will accumulate, and
+// returns the name of the kernel variable ("energyParamDeriv<N>") that interaction code should add
+// its contribution to.  N is the position of param in this object's own list, not its position in
+// the context-wide list.  Registering the same name again returns the same variable name.
+string OpenCLBondedUtilities::addEnergyParameterDerivative(const string& param) {
+    // See if the parameter has already been added.
+
+    const int numDerivatives = (int) energyParameterDerivatives.size();
+    int index;
+    for (index = 0; index < numDerivatives; index++)
+        if (param == energyParameterDerivatives[index])
+            break;
+    if (index == numDerivatives)
+        energyParameterDerivatives.push_back(param);
+
+    // Always forward the name to the context: it keeps its own list (and ignores duplicates), and
+    // initialize() looks the name up there to choose the slot written in energyParamDerivs.
+
+    context.addEnergyParameterDerivative(param);
+    return string("energyParamDeriv")+context.intToString(index);
+}
+
 void OpenCLBondedUtilities::addPrefixCode(const string& source) {
    for (int i = 0; i < (int) prefixCode.size(); i++)
        if (prefixCode[i] == source)
@@ -190,13 +203,23 @@ void OpenCLBondedUtilities::initialize(const System& system) {
        }
        for (int i = 0; i < (int) arguments.size(); i++)
            s<<", __global "<<argTypes[i]<<"* customArg"<<(i+1);
+        if (energyParameterDerivatives.size() > 0)
+            s<<", __global mixed* restrict energyParamDerivs";
        s<<") {\n";
        s<<"mixed energy = 0;\n";
+        for (int i = 0; i < energyParameterDerivatives.size(); i++)
+            s<<"mixed energyParamDeriv"<<i<<" = 0;\n";
        for (int i = 0; i < setSize; i++) {
            int force = set[i];
            s<<createForceSource(i, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
        }
        s<<"energyBuffer[get_global_id(0)] += energy;\n";
+        const vector<string>& allParamDerivNames = context.getEnergyParamDerivNames();
+        int numDerivs = allParamDerivNames.size();
+        for (int i = 0; i < energyParameterDerivatives.size(); i++)
+            for (int index = 0; index < numDerivs; index++)
+                if (allParamDerivNames[index] == energyParameterDerivatives[i])
+                    s<<"energyParamDerivs[get_global_id(0)*"<<numDerivs<<"+"<<index<<"] += energyParamDeriv"<<i<<";\n";
        s<<"}\n";
        map<string, string> defines;
        defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
@@ -274,6 +297,8 @@ void OpenCLBondedUtilities::computeInteractions(int groups) {
            }
            for (int j = 0; j < (int) arguments.size(); j++)
                kernel.setArg<cl::Memory>(index++, *arguments[j]);
+            if (energyParameterDerivatives.size() > 0)
+                kernel.setArg<cl::Memory>(index++, context.getEnergyParamDerivBuffer().getDeviceBuffer());
        }
    }
    for (int i = 0; i < (int) kernels.size(); i++) {

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -69,7 +69,7 @@ static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_i

 OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, const string& precision, OpenCLPlatform::PlatformData& platformData) :
        system(system), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), atomsWereReordered(false), posq(NULL),
-        posqCorrection(NULL), velm(NULL), forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndexDevice(NULL), integration(NULL),
+        posqCorrection(NULL), velm(NULL), forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), integration(NULL),
        expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
    if (precision == "single") {
        useDoublePrecision = false;
@@ -435,6 +435,8 @@ OpenCLContext::~OpenCLContext() {
        delete longForceBuffer;
    if (energyBuffer != NULL)
        delete energyBuffer;
+    if (energyParamDerivBuffer != NULL)
+        delete energyParamDerivBuffer;
    if (atomIndexDevice != NULL)
        delete atomIndexDevice;
    if (integration != NULL)
@@ -455,15 +457,16 @@ void OpenCLContext::initialize() {
    numForceBuffers = std::max(numForceBuffers, bonded->getNumForceBuffers());
    for (int i = 0; i < (int) forces.size(); i++)
        numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers());
+    int energyBufferSize = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
    if (useDoublePrecision) {
        forceBuffers = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
        force = OpenCLArray::create<mm_double4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
-        energyBuffer = OpenCLArray::create<cl_double>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer");
+        energyBuffer = OpenCLArray::create<cl_double>(*this, energyBufferSize, "energyBuffer");
    }
    else {
        forceBuffers = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
        force = OpenCLArray::create<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
-        energyBuffer = OpenCLArray::create<cl_double>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer");
+        energyBuffer = OpenCLArray::create<cl_double>(*this, energyBufferSize, "energyBuffer");
    }
    if (supports64BitGlobalAtomics) {
        longForceBuffer = OpenCLArray::create<cl_long>(*this, 3*paddedNumAtoms, "longForceBuffer");
@@ -475,7 +478,15 @@ void OpenCLContext::initialize() {
    }
    addAutoclearBuffer(*forceBuffers);
    addAutoclearBuffer(*energyBuffer);
-    int bufferBytes = max(velm->getSize()*velm->getElementSize(), energyBuffer->getSize()*energyBuffer->getElementSize());
+    int numEnergyParamDerivs = energyParamDerivNames.size();
+    if (numEnergyParamDerivs > 0) {
+        if (useDoublePrecision || useMixedPrecision)
+            energyParamDerivBuffer = OpenCLArray::create<cl_double>(*this, numEnergyParamDerivs*energyBufferSize, "energyParamDerivBuffer");
+        else
+            energyParamDerivBuffer = OpenCLArray::create<cl_float>(*this, numEnergyParamDerivs*energyBufferSize, "energyParamDerivBuffer");
+        addAutoclearBuffer(*energyParamDerivBuffer);
+    }
+    int bufferBytes = max(velm->getSize()*velm->getElementSize(), energyBufferSize*energyBuffer->getElementSize());
    pinnedBuffer = new cl::Buffer(context, CL_MEM_ALLOC_HOST_PTR, bufferBytes);
    pinnedMemory = currentQueue.enqueueMapBuffer(*pinnedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
    for (int i = 0; i < numAtoms; i++) {
@@ -1229,6 +1240,15 @@ void OpenCLContext::addPostComputation(ForcePostComputation* computation) {
    postComputations.push_back(computation);
 }

+// Adds param to the context-wide list of parameters for which energy derivatives are computed.
+// A name that is already present is not added again, so each name keeps the index it was first
+// given.  NOTE: initialize() sizes energyParamDerivBuffer from this list, so parameters presumably
+// have to be registered before initialize() runs.
+void OpenCLContext::addEnergyParameterDerivative(const string& param) {
+    // See if this parameter has already been registered.
+
+    const int numRegistered = (int) energyParamDerivNames.size();
+    for (int i = 0; i < numRegistered; i++)
+        if (param == energyParamDerivNames[i])
+            return;
+    energyParamDerivNames.push_back(param);
+}
+
 struct OpenCLContext::WorkThread::ThreadData {
    ThreadData(std::queue<OpenCLContext::WorkTask*>& tasks, bool& waiting,  bool& finished,
            pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) :

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2016 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -162,6 +162,19 @@ void OpenCLNonbondedUtilities::addArgument(const ParameterInfo& parameter) {
    arguments.push_back(parameter);
 }

+// Registers param as a parameter whose energy derivative the nonbonded interaction kernel will
+// accumulate, and returns the name of the kernel variable ("energyParamDeriv<N>") that interaction
+// code should add its contribution to.  N is the position of param in this object's own list, not
+// its position in the context-wide list.  Registering the same name again returns the same
+// variable name.
+string OpenCLNonbondedUtilities::addEnergyParameterDerivative(const string& param) {
+    // See if the parameter has already been added.
+
+    const int numDerivatives = (int) energyParameterDerivatives.size();
+    int index;
+    for (index = 0; index < numDerivatives; index++)
+        if (param == energyParameterDerivatives[index])
+            break;
+    if (index == numDerivatives)
+        energyParameterDerivatives.push_back(param);
+
+    // Always forward the name to the context: it keeps its own list (and ignores duplicates), and
+    // createInteractionKernel() looks the name up there to choose the slot written in energyParamDerivs.
+
+    context.addEnergyParameterDerivative(param);
+    return string("energyParamDeriv")+context.intToString(index);
+}
+
 void OpenCLNonbondedUtilities::requestExclusions(const vector<vector<int> >& exclusionList) {
    if (anyExclusions) {
        bool sameExclusions = (exclusionList.size() == atomExclusions.size());
@@ -591,6 +604,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
            args << arguments[i].getName();
        }
    }
+    if (energyParameterDerivatives.size() > 0)
+        args << ", __global mixed* restrict energyParamDerivs";
    replacements["PARAMETER_ARGUMENTS"] = args.str();
    stringstream loadLocal1;
    for (int i = 0; i < (int) params.size(); i++) {
@@ -641,6 +656,18 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
        }
    }
    replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
+    stringstream initDerivs;
+    for (int i = 0; i < energyParameterDerivatives.size(); i++)
+        initDerivs<<"mixed energyParamDeriv"<<i<<" = 0;\n";
+    replacements["INIT_DERIVATIVES"] = initDerivs.str();
+    stringstream saveDerivs;
+    const vector<string>& allParamDerivNames = context.getEnergyParamDerivNames();
+    int numDerivs = allParamDerivNames.size();
+    for (int i = 0; i < energyParameterDerivatives.size(); i++)
+        for (int index = 0; index < numDerivs; index++)
+            if (allParamDerivNames[index] == energyParameterDerivatives[i])
+                saveDerivs<<"energyParamDerivs[get_global_id(0)*"<<numDerivs<<"+"<<index<<"] += energyParamDeriv"<<i<<";\n";
+    replacements["SAVE_DERIVATIVES"] = saveDerivs.str();
    map<string, string> defines;
    if (useCutoff)
        defines["USE_CUTOFF"] = "1";
@@ -716,5 +743,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
    for (int i = 0; i < (int) arguments.size(); i++) {
        kernel.setArg<cl::Memory>(index++, arguments[i].getMemory());
    }
+    if (energyParameterDerivatives.size() > 0)
+        kernel.setArg<cl::Memory>(index++, context.getEnergyParamDerivBuffer().getDeviceBuffer());
    return kernel;
 }