CUDA implementation of RMSDForce

f4dc3110 · Peter Eastman · 2855d12e · f4dc3110 · f4dc3110 · f4dc3110
Commit f4dc3110 authored Jan 24, 2018 by Peter Eastman
6 changed files
--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -1278,6 +1278,59 @@ private:
    CUfunction copyStateKernel, copyForcesKernel, addForcesKernel;
 };
+/**
+ * This kernel is invoked by RMSDForce to calculate the forces acting on the system and the energy of the system.
+ */
+class CudaCalcRMSDForceKernel : public CalcRMSDForceKernel {
+public:
+    CudaCalcRMSDForceKernel(std::string name, const Platform& platform, CudaContext& cu) : CalcRMSDForceKernel(name, platform),
+            cu(cu), referencePos(NULL), particles(NULL), buffer(NULL) {
+    }
+    ~CudaCalcRMSDForceKernel();
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the RMSDForce this kernel will be used for
+     */
+    void initialize(const System& system, const RMSDForce& force);
+    /**
+     * Record the reference positions and particle indices.
+     */
+    void recordParameters(const RMSDForce& force);
+    /**
+     * Execute the kernel to calculate the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * This is the internal implementation of execute(), templatized on whether we're
+     * using single or double precision.
+     */
+    template <class REAL>
+    double executeImpl(ContextImpl& context);
+    /**
+     * Copy changed parameters over to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the RMSDForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const RMSDForce& force);
+private:
+    class ForceInfo;
+    CudaContext& cu;
+    ForceInfo* info;
+    double sumNormRef;
+    CudaArray* referencePos;
+    CudaArray* particles;
+    CudaArray* buffer;
+    CUfunction kernel1, kernel2;
+};
 /**
 * This kernel is invoked by VerletIntegrator to take one time step.
 */

--- a/platforms/cuda/src/CudaKernelFactory.cpp
+++ b/platforms/cuda/src/CudaKernelFactory.cpp
@@ -110,6 +110,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
        return new CudaCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomCVForceKernel::Name())
        return new CudaCalcCustomCVForceKernel(name, platform, cu);
+    if (name == CalcRMSDForceKernel::Name())
+        return new CudaCalcRMSDForceKernel(name, platform, cu);
    if (name == CalcCustomManyParticleForceKernel::Name())
        return new CudaCalcCustomManyParticleForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcGayBerneForceKernel::Name())

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -51,6 +51,7 @@
 #include "ReferenceTabulatedFunction.h"
 #include "SimTKOpenMMRealType.h"
 #include "SimTKOpenMMUtilities.h"
+#include "jama_eig.h"
 #include <algorithm>
 #include <cmath>
 #include <set>
@@ -6784,6 +6785,200 @@ void CudaCalcCustomCVForceKernel::copyState(ContextImpl& context, ContextImpl& i
        innerContext.setParameter(param.first, context.getParameter(param.first));
 }
+class CudaCalcRMSDForceKernel::ForceInfo : public CudaForceInfo {
+public:
+    ForceInfo(const RMSDForce& force) : force(force) {
+        updateParticles();
+    }
+    void updateParticles() {
+        particles.clear();
+        for (int i : force.getParticles())
+            particles.insert(i);
+    }
+    bool areParticlesIdentical(int particle1, int particle2) {
+        bool include1 = (particles.find(particle1) != particles.end());
+        bool include2 = (particles.find(particle2) != particles.end());
+        return (include1 == include2);
+    }
+private:
+    const RMSDForce& force;
+    set<int> particles;
+};
+CudaCalcRMSDForceKernel::~CudaCalcRMSDForceKernel() {
+    if (referencePos != NULL)
+        delete referencePos;
+    if (particles != NULL)
+        delete particles;
+    if (buffer != NULL)
+        delete buffer;
+}
+void CudaCalcRMSDForceKernel::initialize(const System& system, const RMSDForce& force) {
+    // Create data structures.
+    bool useDouble = cu.getUseDoublePrecision();
+    int elementSize = (useDouble ? sizeof(double) : sizeof(float));
+    int numParticles = force.getParticles().size();
+    if (numParticles == 0)
+        numParticles = system.getNumParticles();
+    referencePos = new CudaArray(cu, system.getNumParticles(), 4*elementSize, "referencePos");
+    particles = CudaArray::create<int>(cu, numParticles, "particles");
+    buffer = new CudaArray(cu, 13, elementSize, "buffer");
+    recordParameters(force);
+    info = new ForceInfo(force);
+    cu.addForce(info);
+    // Create the kernels.
+    CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::rmsd);
+    kernel1 = cu.getKernel(module, "computeRMSDPart1");
+    kernel2 = cu.getKernel(module, "computeRMSDForces");
+}
+void CudaCalcRMSDForceKernel::recordParameters(const RMSDForce& force) {
+    // Record the parameters and center the reference positions.
+    vector<int> particleVec = force.getParticles();
+    if (particleVec.size() == 0)
+        for (int i = 0; i < cu.getNumAtoms(); i++)
+            particleVec.push_back(i);
+    vector<Vec3> centeredPositions = force.getReferencePositions();
+    Vec3 center;
+    for (int i : particleVec)
+        center += centeredPositions[i];
+    center /= particleVec.size();
+    for (Vec3& p : centeredPositions)
+        p -= center;
+    // Upload them to the device.
+    particles->upload(particleVec);
+    if (cu.getUseDoublePrecision()) {
+        vector<double4> pos;
+        for (Vec3 p : centeredPositions)
+            pos.push_back(make_double4(p[0], p[1], p[2], 0));
+        referencePos->upload(pos);
+    }
+    else {
+        vector<float4> pos;
+        for (Vec3 p : centeredPositions)
+            pos.push_back(make_float4(p[0], p[1], p[2], 0));
+        referencePos->upload(pos);
+    }
+    // Record the sum of the norms of the reference positions.
+    sumNormRef = 0.0;
+    for (int i : particleVec) {
+        Vec3 p = centeredPositions[i];
+        sumNormRef += p.dot(p);
+    }
+}
+double CudaCalcRMSDForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    if (cu.getUseDoublePrecision())
+        return executeImpl<double>(context);
+    return executeImpl<float>(context);
+}
+template <class REAL>
+double CudaCalcRMSDForceKernel::executeImpl(ContextImpl& context) {
+    // Execute the first kernel.
+    int numParticles = particles->getSize();
+    int blockSize = 128;
+    void* args1[] = {&numParticles, &cu.getPosq().getDevicePointer(), &referencePos->getDevicePointer(),
+            &particles->getDevicePointer(), &buffer->getDevicePointer()};
+    cu.executeKernel(kernel1, args1, blockSize, blockSize, blockSize*sizeof(REAL));
+    // Download the results, build the F matrix, and find the maximum eigenvalue
+    // and eigenvector.
+    vector<REAL> b;
+    buffer->download(b);
+    Array2D<double> F(4, 4);
+    F[0][0] =  b[0*3+0] + b[1*3+1] + b[2*3+2];
+    F[1][0] =  b[1*3+2] - b[2*3+1];
+    F[2][0] =  b[2*3+0] - b[0*3+2];
+    F[3][0] =  b[0*3+1] - b[1*3+0];
+    F[0][1] =  b[1*3+2] - b[2*3+1];
+    F[1][1] =  b[0*3+0] - b[1*3+1] - b[2*3+2];
+    F[2][1] =  b[0*3+1] + b[1*3+0];
+    F[3][1] =  b[0*3+2] + b[2*3+0];
+    F[0][2] =  b[2*3+0] - b[0*3+2];
+    F[1][2] =  b[0*3+1] + b[1*3+0];
+    F[2][2] = -b[0*3+0] + b[1*3+1] - b[2*3+2];
+    F[3][2] =  b[1*3+2] + b[2*3+1];
+    F[0][3] =  b[0*3+1] - b[1*3+0];
+    F[1][3] =  b[0*3+2] + b[2*3+0];
+    F[2][3] =  b[1*3+2] + b[2*3+1];
+    F[3][3] = -b[0*3+0] - b[1*3+1] + b[2*3+2];
+    JAMA::Eigenvalue<double> eigen(F);
+    Array1D<double> values;
+    eigen.getRealEigenvalues(values);
+    Array2D<double> vectors;
+    eigen.getV(vectors);
+    // Compute the RMSD.
+    double msd = (sumNormRef+b[9]-2*values[3])/numParticles;
+    if (msd < 1e-20) {
+        // The particles are perfectly aligned, so all the forces should be zero.
+        // Numerical error can lead to NaNs, so just return 0 now.
+        return 0.0;
+    }
+    double rmsd = sqrt(msd);
+    b[9] = rmsd;
+    // Compute the rotation matrix.
+    double q[] = {vectors[0][3], vectors[1][3], vectors[2][3], vectors[3][3]};
+    double q00 = q[0]*q[0], q01 = q[0]*q[1], q02 = q[0]*q[2], q03 = q[0]*q[3];
+    double q11 = q[1]*q[1], q12 = q[1]*q[2], q13 = q[1]*q[3];
+    double q22 = q[2]*q[2], q23 = q[2]*q[3];
+    double q33 = q[3]*q[3];
+    b[0] = q00+q11-q22-q33;
+    b[1] = 2*(q12-q03);
+    b[2] = 2*(q13+q02);
+    b[3] = 2*(q12+q03);
+    b[4] = q00-q11+q22-q33;
+    b[5] = 2*(q23-q01);
+    b[6] = 2*(q13-q02);
+    b[7] = 2*(q23+q01);
+    b[8] = q00-q11-q22+q33;
+    // Upload it to the device and invoke the kernel to apply forces.
+    buffer->upload(b);
+    int paddedNumAtoms = cu.getPaddedNumAtoms();
+    void* args2[] = {&numParticles, &paddedNumAtoms, &cu.getPosq().getDevicePointer(), &referencePos->getDevicePointer(),
+            &particles->getDevicePointer(), &buffer->getDevicePointer(), &cu.getForce().getDevicePointer()};
+    cu.executeKernel(kernel2, args2, numParticles);
+    return rmsd;
+}
+void CudaCalcRMSDForceKernel::copyParametersToContext(ContextImpl& context, const RMSDForce& force) {
+    if (referencePos->getSize() != force.getReferencePositions().size())
+        throw OpenMMException("updateParametersInContext: The number of reference positions has changed");
+    int numParticles = force.getParticles().size();
+    if (numParticles == 0)
+        numParticles = context.getSystem().getNumParticles();
+    if (numParticles != particles->getSize()) {
+        // Recreate the particles array.
+        delete particles;
+        particles = NULL;
+        particles = CudaArray::create<int>(cu, numParticles, "particles");
+    }
+    recordParameters(force);
+    // Mark that the current reordering may be invalid.
+    info->updateParticles();
+    cu.invalidateMolecules(info);
+}
 CudaIntegrateVerletStepKernel::~CudaIntegrateVerletStepKernel() {
 }

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -92,6 +92,7 @@ CudaPlatform::CudaPlatform() {
    registerKernelFactory(CalcCustomCentroidBondForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomCompoundBondForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomCVForceKernel::Name(), factory);
+    registerKernelFactory(CalcRMSDForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory);
    registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);

--- a/platforms/cuda/src/kernels/rmsd.cu
+++ b/platforms/cuda/src/kernels/rmsd.cu
+// This file contains kernels to compute the RMSD and its gradient using the algorithm described
+// in Coutsias et al, "Using quaternions to calculate RMSD" (doi: 10.1002/jcc.20110).
+/**
+ * Sum a value over all threads.
+ */
+__device__ real reduceValue(real value, real* temp) {
+    const int thread = threadIdx.x;
+    temp[thread] = value;
+    __syncthreads();
+    for (uint step = 1; step < blockDim.x; step *= 2) {
+        if (thread+step < blockDim.x && thread%(2*step) == 0)
+            temp[thread] = temp[thread] + temp[thread+step];
+        __syncthreads();
+    }
+    return temp[0];
+}
+/**
+ * Perform the first step of computing the RMSD.  This is executed as a single work group.
+ */
+extern "C" __global__ void computeRMSDPart1(int numParticles, const real4* __restrict__ posq, const real4* __restrict__ referencePos,
+         const int* __restrict__ particles, real* buffer) {
+    extern __shared__ real temp[];
+    // Compute the center of the particle positions.
+    real3 center = make_real3(0);
+    for (int i = threadIdx.x; i < numParticles; i += blockDim.x)
+        center += trimTo3(posq[particles[i]]);
+    center.x = reduceValue(center.x, temp)/numParticles;
+    center.y = reduceValue(center.y, temp)/numParticles;
+    center.z = reduceValue(center.z, temp)/numParticles;
+    // Compute the correlation matrix.
+    real R[3][3] = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
+    real sum = 0;
+    for (int i = threadIdx.x; i < numParticles; i += blockDim.x) {
+        int index = particles[i];
+        real3 pos = trimTo3(posq[index]) - center;
+        real3 refPos = trimTo3(referencePos[index]);
+        R[0][0] += pos.x*refPos.x;
+        R[0][1] += pos.x*refPos.y;
+        R[0][2] += pos.x*refPos.z;
+        R[1][0] += pos.y*refPos.x;
+        R[1][1] += pos.y*refPos.y;
+        R[1][2] += pos.y*refPos.z;
+        R[2][0] += pos.z*refPos.x;
+        R[2][1] += pos.z*refPos.y;
+        R[2][2] += pos.z*refPos.z;
+        sum += dot(pos, pos);
+    }
+    for (int i = 0; i < 3; i++)
+        for (int j = 0; j < 3; j++)
+            R[i][j] = reduceValue(R[i][j], temp);
+    sum = reduceValue(sum, temp);
+    // Copy everything into the output buffer to send back to the host.
+    if (threadIdx.x == 0) {
+        for (int i = 0; i < 3; i++)
+            for (int j = 0; j < 3; j++)
+                buffer[3*i+j] = R[i][j];
+        buffer[9] = sum;
+        buffer[10] = center.x;
+        buffer[11] = center.y;
+        buffer[12] = center.z;
+    }
+}
+/**
+ * Apply forces based on the RMSD.
+ */
+extern "C" __global__ void computeRMSDForces(int numParticles, int paddedNumAtoms, const real4* __restrict__ posq, const real4* __restrict__ referencePos,
+         const int* __restrict__ particles, const real* buffer, unsigned long long* __restrict__ forceBuffers) {
+    real3 center = make_real3(buffer[10], buffer[11], buffer[12]);
+    real scale = 1 / (real) (buffer[9]*numParticles);
+    for (int i = blockDim.x*blockIdx.x+threadIdx.x; i < numParticles; i += blockDim.x*gridDim.x) {
+        int index = particles[i];
+        real3 pos = trimTo3(posq[index]) - center;
+        real3 refPos = trimTo3(referencePos[index]);
+        real3 rotatedRef = make_real3(buffer[0]*refPos.x + buffer[3]*refPos.y + buffer[6]*refPos.z,
+                                      buffer[1]*refPos.x + buffer[4]*refPos.y + buffer[7]*refPos.z,
+                                      buffer[2]*refPos.x + buffer[5]*refPos.y + buffer[8]*refPos.z);
+        real3 force = (rotatedRef-pos)*scale;
+        atomicAdd(&forceBuffers[index], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
+        atomicAdd(&forceBuffers[index+paddedNumAtoms], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
+        atomicAdd(&forceBuffers[index+2*paddedNumAtoms], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
+    }
+}
--- a/platforms/cuda/tests/TestCudaRMSDForce.cpp
+++ b/platforms/cuda/tests/TestCudaRMSDForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2018 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "CudaTests.h"
+#include "TestRMSDForce.h"
+void runPlatformTests() {
+}