Began overhaul of CUDA CustomIntegrator in preparation for supporting flow control

59c809c0 · peastman · 44b96f0c · 59c809c0 · 59c809c0 · 59c809c0
Commit 59c809c0 authored Jul 16, 2015 by peastman
8 changed files
--- a/platforms/cpu/include/CompiledExpressionSet.h
+++ b/platforms/cpu/include/CompiledExpressionSet.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2014 Stanford University and the Authors.           *
+ * Portions copyright (c) 2014-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -33,7 +33,7 @@
 * -------------------------------------------------------------------------- */

 #include "lepton/CompiledExpression.h"
-#include "windowsExportCpu.h"
+#include "windowsExport.h"
 #include <string>
 #include <vector>

@@ -42,7 +42,7 @@ namespace OpenMM {
 /**
 * This class simplifies the management of a set of related CompiledExpressions that share variables.
 */
-class OPENMM_EXPORT_CPU CompiledExpressionSet {
+class OPENMM_EXPORT CompiledExpressionSet {
 public:
    CompiledExpressionSet();
    /**
@@ -60,6 +60,10 @@ public:
     * @param value    the value to set it to
     */
    void setVariable(int index, double value);
+    /**
+     * Get the total number of variables for which indices have been allocated.
+     */
+    int getNumVariables() const;
 private:
    std::vector<Lepton::CompiledExpression*> expressions;
    std::vector<std::string> variables;

--- a/platforms/cpu/src/CompiledExpressionSet.cpp
+++ b/platforms/cpu/src/CompiledExpressionSet.cpp
-/* Portions copyright (c) 2014 Stanford University and Simbios.
+/* Portions copyright (c) 2014-2015 Stanford University and Simbios.
 * Contributors: Peter Eastman
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -21,7 +21,7 @@
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

-#include "CompiledExpressionSet.h"
+#include "openmm/internal/CompiledExpressionSet.h"

 using namespace OpenMM;
 using namespace Lepton;
@@ -54,3 +54,7 @@ void CompiledExpressionSet::setVariable(int index, double value) {
    for (int i = 0; i < (int) variableReferences[index].size(); i++)
        *variableReferences[index][i] = value;
 }
+
+int CompiledExpressionSet::getNumVariables() const { // number of entries currently held in the `variables` vector
+    return (int) variables.size(); // explicit size_t -> int narrowing, same cast style as the loop in setVariable()
+}
--- a/platforms/cpu/include/CpuCustomGBForce.h
+++ b/platforms/cpu/include/CpuCustomGBForce.h
@@ -25,10 +25,10 @@
 #ifndef OPENMM_CPU_CUSTOM_GB_FORCE_H__
 #define OPENMM_CPU_CUSTOM_GB_FORCE_H__

-#include "CompiledExpressionSet.h"
 #include "CpuNeighborList.h"
 #include "lepton/CompiledExpression.h"
 #include "openmm/CustomGBForce.h"
+#include "openmm/internal/CompiledExpressionSet.h"
 #include "openmm/internal/ThreadPool.h"
 #include "openmm/internal/vectorize.h"
 #include <map>

--- a/platforms/cpu/include/CpuCustomManyParticleForce.h
+++ b/platforms/cpu/include/CpuCustomManyParticleForce.h
@@ -27,9 +27,9 @@

 #include "ReferenceForce.h"
 #include "ReferenceBondIxn.h"
-#include "CompiledExpressionSet.h"
 #include "CpuNeighborList.h"
 #include "openmm/CustomManyParticleForce.h"
+#include "openmm/internal/CompiledExpressionSet.h"
 #include "openmm/internal/ThreadPool.h"
 #include "openmm/internal/vectorize.h"
 #include "lepton/CompiledExpression.h"

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -35,6 +35,9 @@
 #include "CudaSort.h"
 #include "openmm/kernels.h"
 #include "openmm/System.h"
+#include "openmm/internal/CompiledExpressionSet.h"
+#include "openmm/internal/CustomIntegratorUtilities.h"
+#include "lepton/CompiledExpression.h"
 #include <cufft.h>

 namespace OpenMM {
@@ -1213,6 +1216,7 @@ private:
 */
 class CudaIntegrateCustomStepKernel : public IntegrateCustomStepKernel {
 public:
+    enum GlobalTargetType {DT, VARIABLE, PARAMETER};
    CudaIntegrateCustomStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateCustomStepKernel(name, platform), cu(cu),
            hasInitializedKernels(false), localValuesAreCurrent(false), globalValues(NULL), contextParameterValues(NULL), sumBuffer(NULL), potentialEnergy(NULL),
            kineticEnergy(NULL), uniformRandoms(NULL), randomSeed(NULL), perDofValues(NULL) {
@@ -1279,15 +1283,17 @@ public:
    void setPerDofVariable(ContextImpl& context, int variable, const std::vector<Vec3>& values);
 private:
    class ReorderListener;
+    class GlobalTarget;
    std::string createGlobalComputation(const std::string& variable, const Lepton::ParsedExpression& expr, CustomIntegrator& integrator, const std::string& energyName);
    std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName);
    void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
+    void recordGlobalValue(double value, GlobalTarget target);
    void recordChangedParameters(ContextImpl& context);
    CudaContext& cu;
    double prevStepSize, energy;
    float energyFloat;
    int numGlobalVariables;
-    bool hasInitializedKernels, deviceValuesAreCurrent, modifiesParameters, keNeedsForce;
+    bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce;
    mutable bool localValuesAreCurrent;
    CudaArray* globalValues;
    CudaArray* contextParameterValues;
@@ -1303,19 +1309,43 @@ private:
    mutable std::vector<std::vector<double> > localPerDofValuesDouble;
    std::vector<float> contextValuesFloat;
    std::vector<double> contextValuesDouble;
+    std::vector<float> globalValuesFloat;
+    std::vector<double> globalValuesDouble;
+    std::vector<double> initialGlobalVariables;
    std::vector<std::vector<CUfunction> > kernels;
    std::vector<std::vector<std::vector<void*> > > kernelArgs;
    std::vector<void*> kineticEnergyArgs;
    CUfunction randomKernel, kineticEnergyKernel, sumKineticEnergyKernel;
    std::vector<CustomIntegrator::ComputationType> stepType;
+    std::vector<CustomIntegratorUtilities::Comparison> comparisons;
+    std::vector<std::vector<Lepton::CompiledExpression> > globalExpressions;
+    CompiledExpressionSet expressionSet;
+    std::vector<bool> needsGlobals;
    std::vector<bool> needsForces;
    std::vector<bool> needsEnergy;
+    std::vector<bool> computeBothForceAndEnergy;
    std::vector<bool> invalidatesForces;
    std::vector<bool> merged;
-    std::vector<int> forceGroup;
+    std::vector<int> forceGroupFlags;
+    std::vector<int> blockEnd;
    std::vector<int> requiredGaussian;
    std::vector<int> requiredUniform;
+    std::vector<int> stepEnergyVariableIndex;
+    std::vector<int> globalVariableIndex;
+    std::vector<int> parameterVariableIndex;
+    int gaussianVariableIndex, uniformVariableIndex, dtVariableIndex;
    std::vector<std::string> parameterNames;
+    std::vector<GlobalTarget> stepTarget;
+};
+
+class CudaIntegrateCustomStepKernel::GlobalTarget { // pairs a GlobalTargetType with an integer index; presumably names where a computed global value is written - confirm in CudaKernels.cpp
+public:
+    CudaIntegrateCustomStepKernel::GlobalTargetType type; // one of DT, VARIABLE, PARAMETER
+    int variableIndex; // index paired with `type`; NOTE(review): which array it indexes is not visible here - confirm
+    GlobalTarget() : type(CudaIntegrateCustomStepKernel::DT), variableIndex(-1) { // give both members defined values; they were previously left indeterminate
+    }
+    GlobalTarget(CudaIntegrateCustomStepKernel::GlobalTargetType type, int variableIndex) : type(type), variableIndex(variableIndex) { // stores both arguments unchanged
+    }
 };

 /**

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
--- a/platforms/cuda/src/kernels/customIntegrator.cu
+++ b/platforms/cuda/src/kernels/customIntegrator.cu
@@ -11,7 +11,7 @@ extern "C" __global__ void computeFloatSum(const float* __restrict__ sumBuffer,
            tempBuffer[thread] += tempBuffer[thread+i];
    }
    if (thread == 0)
-        result[SUM_OUTPUT_INDEX] = tempBuffer[0];
+        *result = tempBuffer[0];
 }

 extern "C" __global__ void computeDoubleSum(const double* __restrict__ sumBuffer, double* result) {
@@ -27,7 +27,7 @@ extern "C" __global__ void computeDoubleSum(const double* __restrict__ sumBuffer
            tempBuffer[thread] += tempBuffer[thread+i];
    }
    if (thread == 0)
-        result[SUM_OUTPUT_INDEX] = tempBuffer[0];
+        *result = tempBuffer[0];
 }

 extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta) {

--- a/platforms/cuda/src/kernels/integrationUtilities.cu
+++ b/platforms/cuda/src/kernels/integrationUtilities.cu
@@ -224,7 +224,6 @@ extern "C" __global__ void applyShakeToVelocities(int numClusters, mixed tol, co
        mixed4 xpj2 = make_mixed4(0);
        float invMassCentral = params.x;
        float avgMass = params.y;
-        float d2 = params.z;
        float invMassPeripheral = params.w;
        if (atoms.z != -1) {
            pos2 = loadPos(oldPos, posCorrection, atoms.z);
@@ -245,9 +244,6 @@ extern "C" __global__ void applyShakeToVelocities(int numClusters, mixed tol, co
        mixed rij1sq = rij1.x*rij1.x + rij1.y*rij1.y + rij1.z*rij1.z;
        mixed rij2sq = rij2.x*rij2.x + rij2.y*rij2.y + rij2.z*rij2.z;
        mixed rij3sq = rij3.x*rij3.x + rij3.y*rij3.y + rij3.z*rij3.z;
-        mixed ld1 = d2-rij1sq;
-        mixed ld2 = d2-rij2sq;
-        mixed ld3 = d2-rij3sq;

        // Iterate until convergence.

@@ -605,8 +601,6 @@ extern "C" __global__ void computeCCMAVelocityConstraintForce(const int2* __rest
    if (threadIdx.x == 0)
        groupConverged = 1;
    __syncthreads();
-    mixed lowerTol = 1-2*tol+tol*tol;
-    mixed upperTol = 1+2*tol+tol*tol;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) {
        // Compute the force due to this constraint.