Merge github.com:leeping/openmm

e3b25204 · leeping · 41e9a095 · 74415dd9 · e3b25204 · e3b25204
Commit e3b25204 authored Jul 15, 2013 by leeping
20 changed files
--- a/platforms/cuda/src/CudaExpressionUtilities.h
+++ b/platforms/cuda/src/CudaExpressionUtilities.h
--- a/platforms/cuda/src/CudaForceInfo.h
+++ b/platforms/cuda/src/CudaForceInfo.h
--- a/platforms/cuda/src/CudaIntegrationUtilities.h
+++ b/platforms/cuda/src/CudaIntegrationUtilities.h
--- a/platforms/cuda/src/CudaKernels.h
+++ b/platforms/cuda/src/CudaKernels.h
@@ -557,7 +557,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
    CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
            cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
-            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL) {
+            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), pmeio(NULL) {
    }
    ~CudaCalcNonbondedForceKernel();
    /**
@@ -596,6 +596,9 @@ private:
        const char* getMaxValue() const {return "make_int2(INT_MAX, INT_MAX)";}
        const char* getSortKey() const {return "value.y";}
    };
+    class PmeIO;
+    class PmePreComputation;
+    class PmePostComputation;
    CudaContext& cu;
    bool hasInitializedFFT;
    CudaArray* sigmaEpsilon;
@@ -609,6 +612,8 @@ private:
    CudaArray* pmeAtomRange;
    CudaArray* pmeAtomGridIndex;
    CudaSort* sort;
+    Kernel cpuPme;
+    PmeIO* pmeio;
    cufftHandle fftForward;
    cufftHandle fftBackward;
    CUfunction ewaldSumsKernel;

--- a/platforms/cuda/src/CudaNonbondedUtilities.h
+++ b/platforms/cuda/src/CudaNonbondedUtilities.h
--- a/platforms/cuda/src/CudaParallelKernels.h
+++ b/platforms/cuda/src/CudaParallelKernels.h
--- a/platforms/cuda/src/CudaParameterSet.h
+++ b/platforms/cuda/src/CudaParameterSet.h
--- a/platforms/cuda/include/CudaPlatform.h
+++ b/platforms/cuda/include/CudaPlatform.h
@@ -81,6 +81,13 @@ public:
        static const std::string key = "CudaPrecision";
        return key;
    }
+    /**
+     * This is the name of the parameter for selecting whether to use the CPU based PME calculation.
+     */
+    static const std::string& CudaUseCpuPme() {
+        static const std::string key = "CudaUseCpuPme";
+        return key;
+    }
    /**
     * This is the name of the parameter for specifying the path to the CUDA compiler.
     */
@@ -99,14 +106,15 @@ public:

 class OPENMM_EXPORT_CUDA CudaPlatform::PlatformData {
 public:
-    PlatformData(const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
-            const std::string& compilerProperty, const std::string& tempProperty);
+    PlatformData(ContextImpl* context, const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
+            const std::string& cpuPmeProperty, const std::string& compilerProperty, const std::string& tempProperty);
    ~PlatformData();
    void initializeContexts(const System& system);
    void syncContexts();
+    ContextImpl* context;
    std::vector<CudaContext*> contexts;
    std::vector<double> contextEnergy;
-    bool removeCM, peerAccessSupported;
+    bool removeCM, peerAccessSupported, useCpuPme;
    int cmMotionFrequency;
    int stepCount, computeForceCount;
    double time;

--- a/platforms/cuda/src/CudaSort.h
+++ b/platforms/cuda/src/CudaSort.h
--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -231,6 +231,10 @@ CudaContext::~CudaContext() {
        delete forces[i];
    for (int i = 0; i < (int) reorderListeners.size(); i++)
        delete reorderListeners[i];
+    for (int i = 0; i < (int) preComputations.size(); i++)
+        delete preComputations[i];
+    for (int i = 0; i < (int) postComputations.size(); i++)
+        delete postComputations[i];
    if (pinnedBuffer != NULL)
        cuMemFreeHost(pinnedBuffer);
    if (posq != NULL)
@@ -743,7 +747,8 @@ void CudaContext::findMoleculeGroups() {
            for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
                vector<int> particles;
                forces[i]->getParticlesInGroup(j, particles);
-                molecules[atomMolecule[particles[0]]].groups[i].push_back(j);
+                if (particles.size() > 0)
+                    molecules[atomMolecule[particles[0]]].groups[i].push_back(j);
            }
    }

@@ -1102,6 +1107,14 @@ void CudaContext::addReorderListener(ReorderListener* listener) {
    reorderListeners.push_back(listener);
 }

+void CudaContext::addPreComputation(ForcePreComputation* computation) {
+    preComputations.push_back(computation);
+}
+
+void CudaContext::addPostComputation(ForcePostComputation* computation) {
+    postComputations.push_back(computation);
+}
+
 struct CudaContext::WorkThread::ThreadData {
    ThreadData(std::queue<CudaContext::WorkTask*>& tasks, bool& waiting,  bool& finished,
            pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) :

--- a/platforms/cuda/src/CudaExpressionUtilities.cpp
+++ b/platforms/cuda/src/CudaExpressionUtilities.cpp
@@ -181,7 +181,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
            out << "ASIN(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::ACOS:
-            out << "ACSO(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "ACOS(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::ATAN:
            out << "ATAN(" << getTempName(node.getChildren()[0], temps) << ")";

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -457,7 +457,9 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
    }
    replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();

-    bool useShuffle = (context.getComputeCapability() >= 3.0);
+    int cudaVersion;
+    cuDriverGetVersion(&cudaVersion);
+    bool useShuffle = (context.getComputeCapability() >= 3.0 && cudaVersion >= 5050);

    // Part 1. Defines for on diagonal exclusion tiles
    stringstream loadLocal1;

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -87,12 +87,14 @@ CudaPlatform::CudaPlatform() {
    platformProperties.push_back(CudaDeviceName());
    platformProperties.push_back(CudaUseBlockingSync());
    platformProperties.push_back(CudaPrecision());
+    platformProperties.push_back(CudaUseCpuPme());
    platformProperties.push_back(CudaCompiler());
    platformProperties.push_back(CudaTempDirectory());
    setPropertyDefaultValue(CudaDeviceIndex(), "");
    setPropertyDefaultValue(CudaDeviceName(), "");
    setPropertyDefaultValue(CudaUseBlockingSync(), "true");
    setPropertyDefaultValue(CudaPrecision(), "single");
+    setPropertyDefaultValue(CudaUseCpuPme(), "false");
 #ifdef _MSC_VER
    char* bindir = getenv("CUDA_BIN_PATH");
    string nvcc = (bindir == NULL ? "nvcc.exe" : string(bindir)+"\\nvcc.exe");
@@ -141,13 +143,20 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
            getPropertyDefaultValue(CudaUseBlockingSync()) : properties.find(CudaUseBlockingSync())->second);
    string precisionPropValue = (properties.find(CudaPrecision()) == properties.end() ?
            getPropertyDefaultValue(CudaPrecision()) : properties.find(CudaPrecision())->second);
+    string cpuPmePropValue = (properties.find(CudaUseCpuPme()) == properties.end() ?
+            getPropertyDefaultValue(CudaUseCpuPme()) : properties.find(CudaUseCpuPme())->second);
    const string& compilerPropValue = (properties.find(CudaCompiler()) == properties.end() ?
            getPropertyDefaultValue(CudaCompiler()) : properties.find(CudaCompiler())->second);
    const string& tempPropValue = (properties.find(CudaTempDirectory()) == properties.end() ?
            getPropertyDefaultValue(CudaTempDirectory()) : properties.find(CudaTempDirectory())->second);
    transform(blockingPropValue.begin(), blockingPropValue.end(), blockingPropValue.begin(), ::tolower);
    transform(precisionPropValue.begin(), precisionPropValue.end(), precisionPropValue.begin(), ::tolower);
-    context.setPlatformData(new PlatformData(context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, compilerPropValue, tempPropValue));
+    transform(cpuPmePropValue.begin(), cpuPmePropValue.end(), cpuPmePropValue.begin(), ::tolower);
+    vector<string> pmeKernelName;
+    pmeKernelName.push_back(CalcPmeReciprocalForceKernel::Name());
+    if (!supportsKernels(pmeKernelName))
+        cpuPmePropValue = "false";
+    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue));
 }

 void CudaPlatform::contextDestroyed(ContextImpl& context) const {
@@ -155,8 +164,8 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
    delete data;
 }

-CudaPlatform::PlatformData::PlatformData(const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
-            const string& compilerProperty, const string& tempProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0)  {
+CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
+            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty) : context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0)  {
    bool blocking = (blockingProperty == "true");
    vector<string> devices;
    size_t searchPos = 0, nextPos;
@@ -185,10 +194,12 @@ CudaPlatform::PlatformData::PlatformData(const System& system, const string& dev
        CHECK_RESULT(cuDeviceGetName(name, 1000, contexts[i]->getDevice()), "Error querying device name");
        deviceName << name;
    }
+    useCpuPme = (cpuPmeProperty == "true" && !contexts[0]->getUseDoublePrecision());
    propertyValues[CudaPlatform::CudaDeviceIndex()] = deviceIndex.str();
    propertyValues[CudaPlatform::CudaDeviceName()] = deviceName.str();
    propertyValues[CudaPlatform::CudaUseBlockingSync()] = blocking ? "true" : "false";
    propertyValues[CudaPlatform::CudaPrecision()] = precisionProperty;
+    propertyValues[CudaPlatform::CudaUseCpuPme()] = useCpuPme ? "true" : "false";
    propertyValues[CudaPlatform::CudaCompiler()] = compilerProperty;
    propertyValues[CudaPlatform::CudaTempDirectory()] = tempProperty;
    contextEnergy.resize(contexts.size());

--- a/platforms/cuda/src/kernels/customIntegrator.cu
+++ b/platforms/cuda/src/kernels/customIntegrator.cu
@@ -52,10 +52,10 @@ extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4*
    }
 }

-extern "C" __global__ void generateRandomNumbers(float4* __restrict__ random, uint4* __restrict__ seed) {
+extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restrict__ random, uint4* __restrict__ seed) {
    uint4 state = seed[blockIdx.x*blockDim.x+threadIdx.x];
    unsigned int carry = 0;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numValues; index += blockDim.x*gridDim.x) {
        // Generate three uniform random numbers.

        state.x = state.x * 69069 + 1;

--- a/platforms/cuda/src/kernels/customIntegratorPerDof.cu
+++ b/platforms/cuda/src/kernels/customIntegratorPerDof.cu
@@ -34,11 +34,10 @@ inline __device__ mixed4 convertFromDouble4(double4 a) {
 extern "C" __global__ void computePerDof(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta,
        mixed4* __restrict__ velm, const long long* __restrict__ force, const mixed2* __restrict__ dt, const mixed* __restrict__ globals,
        const mixed* __restrict__ params, mixed* __restrict__ sum, const float4* __restrict__ gaussianValues,
-        unsigned int randomIndex, const float4* __restrict__ uniformValues, const real* __restrict__ energy
+        unsigned int gaussianBaseIndex, const float4* __restrict__ uniformValues, const real* __restrict__ energy
        PARAMETER_ARGUMENTS) {
    mixed stepSize = dt[0].y;
    int index = blockIdx.x*blockDim.x+threadIdx.x;
-    randomIndex += index;
    const double forceScale = 1.0/0xFFFFFFFF;
    while (index < NUM_ATOMS) {
 #ifdef LOAD_POS_AS_DELTA
@@ -50,11 +49,10 @@ extern "C" __global__ void computePerDof(real4* __restrict__ posq, real4* __rest
        double4 f = make_double4(forceScale*force[index], forceScale*force[index+PADDED_NUM_ATOMS], forceScale*force[index+PADDED_NUM_ATOMS*2], 0.0);
        double mass = 1.0/velocity.w;
        if (velocity.w != 0.0) {
-            float4 gaussian = gaussianValues[randomIndex];
-            float4 uniform = uniformValues[index];
+            int gaussianIndex = gaussianBaseIndex;
+            int uniformIndex = 0;
            COMPUTE_STEP
        }
-        randomIndex += blockDim.x*gridDim.x;
        index += blockDim.x*gridDim.x;
    }
 }
--- a/platforms/cuda/src/kernels/integrationUtilities.cu
+++ b/platforms/cuda/src/kernels/integrationUtilities.cu
@@ -364,9 +364,9 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
        mixed yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
        mixed zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;

-        mixed axlng = SQRT(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
-        mixed aylng = SQRT(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
-        mixed azlng = SQRT(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
+        mixed axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
+        mixed aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
+        mixed azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
        mixed trns11 = xaksXd / axlng;
        mixed trns21 = yaksXd / axlng;
        mixed trns31 = zaksXd / axlng;
@@ -392,13 +392,13 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
        //                                        --- Step2  A2' ---

        float rc = 0.5f*params.y;
-        mixed rb = SQRT(params.x*params.x-rc*rc);
+        mixed rb = sqrt(params.x*params.x-rc*rc);
        mixed ra = rb*(m1+m2)*invTotalMass;
        rb -= ra;
        mixed sinphi = za1d/ra;
-        mixed cosphi = SQRT(1-sinphi*sinphi);
+        mixed cosphi = sqrt(1-sinphi*sinphi);
        mixed sinpsi = (zb1d-zc1d) / (2*rc*cosphi);
-        mixed cospsi = SQRT(1-sinpsi*sinpsi);
+        mixed cospsi = sqrt(1-sinpsi*sinpsi);

        mixed ya2d =   ra*cosphi;
        mixed xb2d = - rc*cospsi;
@@ -406,7 +406,7 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
        mixed yc2d = - rb*cosphi + rc*sinpsi*sinphi;
        mixed xb2d2 = xb2d*xb2d;
        mixed hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d);
-        mixed deltx = 2.0f*xb2d + SQRT(4.0f*xb2d2 - hh2 + params.y*params.y);
+        mixed deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y);
        xb2d -= deltx*0.5f;

        //                                        --- Step3  al,be,ga ---
@@ -416,11 +416,11 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
        mixed gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d;

        mixed al2be2 = alpha*alpha + beta*beta;
-        mixed sintheta = (alpha*gamma - beta*SQRT(al2be2 - gamma*gamma)) / al2be2;
+        mixed sintheta = (alpha*gamma - beta*sqrt(al2be2 - gamma*gamma)) / al2be2;

        //                                        --- Step4  A3' ---

-        mixed costheta = SQRT(1-sintheta*sintheta);
+        mixed costheta = sqrt(1-sintheta*sintheta);
        mixed xa3d = - ya2d*sintheta;
        mixed ya3d =   ya2d*costheta;
        mixed za3d = za1d;

--- a/platforms/cuda/src/kernels/pme.cu
+++ b/platforms/cuda/src/kernels/pme.cu
@@ -266,8 +266,18 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
            }
        }
        real q = pos.w*EPSILON_FACTOR;
-        forceBuffers[atom] +=  static_cast<unsigned long long>((long long) (-q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x*0x100000000));
-        forceBuffers[atom+PADDED_NUM_ATOMS] +=  static_cast<unsigned long long>((long long) (-q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y*0x100000000));
-        forceBuffers[atom+2*PADDED_NUM_ATOMS] +=  static_cast<unsigned long long>((long long) (-q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z*0x100000000));
+        forceBuffers[atom] += static_cast<unsigned long long>((long long) (-q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x*0x100000000));
+        forceBuffers[atom+PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y*0x100000000));
+        forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z*0x100000000));
+    }
+}
+
+extern "C" __global__
+void addForces(const real4* __restrict__ forces, unsigned long long* __restrict__ forceBuffers) {
+    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
+        real4 f = forces[atom];
+        forceBuffers[atom] += static_cast<unsigned long long>((long long) (f.x*0x100000000));
+        forceBuffers[atom+PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (f.y*0x100000000));
+        forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (f.z*0x100000000));
    }
 }
--- a/platforms/cuda/tests/TestCudaAndersenThermostat.cpp
+++ b/platforms/cuda/tests/TestCudaAndersenThermostat.cpp
@@ -40,7 +40,7 @@
 #include "openmm/NonbondedForce.h"
 #include "openmm/System.h"
 #include "openmm/VerletIntegrator.h"
-#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include "SimTKOpenMMRealType.h"
 #include "sfmt/SFMT.h"
 #include <iostream>
 #include <vector>

--- a/platforms/cuda/tests/TestCudaBrownianIntegrator.cpp
+++ b/platforms/cuda/tests/TestCudaBrownianIntegrator.cpp
@@ -43,7 +43,7 @@
 #include "openmm/NonbondedForce.h"
 #include "openmm/System.h"
 #include "openmm/BrownianIntegrator.h"
-#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include "SimTKOpenMMRealType.h"
 #include "sfmt/SFMT.h"
 #include <iostream>
 #include <vector>