Continuing CUDA implementation of parameter derivatives

4949017b · Peter Eastman · eae8def5 · 4949017b · 4949017b · 4949017b
Commit 4949017b authored Jul 27, 2016 by Peter Eastman
9 changed files
--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -826,13 +826,15 @@ public:
    void copyParametersToContext(ContextImpl& context, const CustomGBForce& force);
 private:
    double cutoff;
-    bool hasInitializedKernels, needParameterGradient;
+    bool hasInitializedKernels, needParameterGradient, needEnergyParamDerivs;
    int maxTiles, numComputedValues;
    CudaContext& cu;
    CudaParameterSet* params;
    CudaParameterSet* computedValues;
    CudaParameterSet* energyDerivs;
    CudaParameterSet* energyDerivChain;
+    std::vector<CudaParameterSet*> dValuedParam;
+    std::vector<CudaArray*> dValue0dParam;
    CudaArray* longEnergyDerivs;
    CudaArray* globals;
    CudaArray* valueBuffers;

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
--- a/platforms/cuda/src/kernels/customGBEnergyN2.cu
+++ b/platforms/cuda/src/kernels/customGBEnergyN2.cu
@@ -28,6 +28,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
    const unsigned int tbx = threadIdx.x - tgx;
    mixed energy = 0;
+    INIT_PARAM_DERIVS
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];

    // First loop: process tiles that contain exclusions.
@@ -69,6 +70,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
                    atom2 = y*TILE_SIZE+j;
                    real dEdR = 0;
                    real tempEnergy = 0;
+                    const real interactionScale = 0.5f;
 #ifdef USE_EXCLUSIONS
                    bool isExcluded = !(excl & 0x1);
 #endif
@@ -120,6 +122,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
                    atom2 = y*TILE_SIZE+tj;
                    real dEdR = 0;
                    real tempEnergy = 0;
+                    const real interactionScale = 1;
 #ifdef USE_EXCLUSIONS
                    bool isExcluded = !(excl & 0x1);
 #endif
@@ -266,6 +269,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
                        atom2 = atomIndices[tbx+tj];
                        real dEdR = 0;
                        real tempEnergy = 0;
+                        const real interactionScale = 1;
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                            COMPUTE_INTERACTION
                            dEdR /= -r;
@@ -309,6 +313,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
                        atom2 = atomIndices[tbx+tj];
                        real dEdR = 0;
                        real tempEnergy = 0;
+                        const real interactionScale = 1;
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                            COMPUTE_INTERACTION
                            dEdR /= -r;
@@ -353,4 +358,5 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
        pos++;
    }
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+    SAVE_PARAM_DERIVS
 }
--- a/platforms/cuda/src/kernels/customGBEnergyPerParticle.cu
+++ b/platforms/cuda/src/kernels/customGBEnergyPerParticle.cu
@@ -5,6 +5,7 @@
 extern "C" __global__ void computePerParticleEnergy(long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq
        PARAMETER_ARGUMENTS) {
    mixed energy = 0;
+    INIT_PARAM_DERIVS
    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
        // Load the derivatives

@@ -17,4 +18,5 @@ extern "C" __global__ void computePerParticleEnergy(long long* __restrict__ forc
        COMPUTE_ENERGY
    }
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+    SAVE_PARAM_DERIVS
 }
--- a/platforms/cuda/src/kernels/customGBGradientChainRule.cu
+++ b/platforms/cuda/src/kernels/customGBGradientChainRule.cu
@@ -4,6 +4,7 @@

 extern "C" __global__ void computeGradientChainRuleTerms(long long* __restrict__ forceBuffers, const real4* __restrict__ posq
        PARAMETER_ARGUMENTS) {
+    INIT_PARAM_DERIVS
    const real scale = RECIP((real) 0x100000000);
    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
        real4 pos = posq[index];
@@ -13,4 +14,5 @@ extern "C" __global__ void computeGradientChainRuleTerms(long long* __restrict__
        forceBuffers[index+PADDED_NUM_ATOMS] = (long long) (force.y*0x100000000);
        forceBuffers[index+PADDED_NUM_ATOMS*2] = (long long) (force.z*0x100000000);
    }
+    SAVE_PARAM_DERIVS
 }
--- a/platforms/cuda/src/kernels/customGBValueN2.cu
+++ b/platforms/cuda/src/kernels/customGBValueN2.cu
@@ -73,6 +73,7 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
                        COMPUTE_VALUE
                    }
                    value += tempValue1;
+                    ADD_TEMP_DERIVS1
 #ifdef USE_CUTOFF
                }
 #endif
@@ -121,6 +122,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
                    }
                    value += tempValue1;
                    localData[tbx+tj].value += tempValue2;
+                    ADD_TEMP_DERIVS1
+                    ADD_TEMP_DERIVS2
 #ifdef USE_CUTOFF
                }
 #endif
@@ -133,11 +136,13 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const

        // Write results.

-        unsigned int offset = x*TILE_SIZE + tgx;
-        atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (value*0x100000000)));
+        unsigned int offset1 = x*TILE_SIZE + tgx;
+        atomicAdd(&global_value[offset1], static_cast<unsigned long long>((long long) (value*0x100000000)));
+        STORE_PARAM_DERIVS1
        if (x != y) {
-            offset = y*TILE_SIZE + tgx;
-            atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
+            unsigned int offset2 = y*TILE_SIZE + tgx;
+            atomicAdd(&global_value[offset2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
+            STORE_PARAM_DERIVS2
        }
    }

@@ -244,6 +249,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
                        }
                        value += tempValue1;
                        localData[tbx+tj].value += tempValue2;
+                        ADD_TEMP_DERIVS1
+                        ADD_TEMP_DERIVS2
                    }
                    tj = (tj + 1) & (TILE_SIZE - 1);
                }
@@ -276,6 +283,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
                        }
                        value += tempValue1;
                        localData[tbx+tj].value += tempValue2;
+                        ADD_TEMP_DERIVS1
+                        ADD_TEMP_DERIVS2
 #ifdef USE_CUTOFF
                    }
 #endif
@@ -285,14 +294,19 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
        
            // Write results.

-            atomicAdd(&global_value[atom1], static_cast<unsigned long long>((long long) (value*0x100000000)));
+            unsigned int offset1 = atom1;
+            atomicAdd(&global_value[offset1], static_cast<unsigned long long>((long long) (value*0x100000000)));
+            STORE_PARAM_DERIVS1
 #ifdef USE_CUTOFF
            unsigned int atom2 = atomIndices[threadIdx.x];
 #else
            unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
-            if (atom2 < PADDED_NUM_ATOMS)
-                atomicAdd(&global_value[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
+            if (atom2 < PADDED_NUM_ATOMS) {
+                unsigned int offset2 = atom2;
+                atomicAdd(&global_value[offset2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
+                STORE_PARAM_DERIVS2
+            }
        }
        pos++;
    }

--- a/platforms/cuda/src/kernels/customGBValuePerParticle.cu
+++ b/platforms/cuda/src/kernels/customGBValuePerParticle.cu
@@ -8,6 +8,7 @@ extern "C" __global__ void computePerParticleValues(real4* posq, long long* valu
        // Load the pairwise value

        real sum = valueBuffers[index]/(real) 0x100000000;
+        REDUCE_PARAM0_DERIV
        
        // Now calculate other values


--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -3301,7 +3301,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
            string variableName = "dValuedParam_0_"+cl.intToString(i);
            if (useLong) {
                extraArgs << ", __global const long* restrict dValue0dParam" << i;
-                deriv0 << "real " << variableName << " = (1.0f/0x100000000)*dValue0dParam[index];\n";
+                deriv0 << "real " << variableName << " = (1.0f/0x100000000)*dValue0dParam" << i << "[index];\n";
            }
            else {
                extraArgs << ", __global const real* restrict dValue0dParam" << i;

--- a/platforms/opencl/src/kernels/customGBValueN2.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2.cl
@@ -320,7 +320,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
            unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
 #ifdef SUPPORTS_64_BIT_ATOMICS
-            unsigned in offset1 = atom1;
+            unsigned int offset1 = atom1;
            atom_add(&global_value[offset1], (long) (value*0x100000000));
            STORE_PARAM_DERIVS1
            if (atom2 < PADDED_NUM_ATOMS) {