Created CUDA implementation of ring polymer contraction

863447a6 · peastman · 2e0ebb67 · 863447a6 · 863447a6 · 863447a6
Commit 863447a6 authored May 30, 2013 by peastman
4 changed files
--- a/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.cpp
+++ b/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.cpp
@@ -69,6 +69,10 @@ CudaIntegrateRPMDStepKernel::~CudaIntegrateRPMDStepKernel() {
        delete positions;
    if (velocities != NULL)
        delete velocities;
+    if (contractedForces != NULL)
+        delete contractedForces;
+    if (contractedPositions != NULL)
+        delete contractedPositions;
 }
 void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) {
@@ -106,6 +110,34 @@ void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDInt
            temp[i] = make_float4(0, 0, 0, 1);
        velocities->upload(temp);
    }
+    // Build a list of contractions.
+    //
+    // groupsByCopies maps a contracted copy count to the bit mask of the force groups that
+    // are evaluated on that many copies.  groupsNotContracted starts with every bit set (-1)
+    // and loses the bit of each group that is given a contraction; it is the group mask
+    // used when forces are evaluated on the full set of copies.
+    groupsNotContracted = -1;
+    const map<int, int>& contractions = integrator.getContractions();
+    int maxContractedCopies = 0;
+    for (map<int, int>::const_iterator iter = contractions.begin(); iter != contractions.end(); ++iter) {
+        int group = iter->first;
+        int copies = iter->second;
+        if (group < 0 || group > 31)
+            throw OpenMMException("RPMDIntegrator: Force group must be between 0 and 31");
+        // NOTE(review): a contraction of 0 copies passes this check; it would later yield
+        // FORCE_SCALE = 1.0/0 when the contraction kernels are built.  Confirm whether 0
+        // should be rejected here.
+        if (copies < 0 || copies > numCopies)
+            throw OpenMMException("RPMDIntegrator: Number of copies for contraction cannot be greater than the total number of copies being simulated");
+        if (copies != numCopies) {
+            const int groupBit = 1<<group;
+            // operator[] value-initializes a missing entry to 0, so a single |= covers both
+            // the first group seen for this copy count and every later one.
+            groupsByCopies[copies] |= groupBit;
+            // Every contracted group has to leave the uncontracted mask, not only the first
+            // group registered for a given copy count; otherwise it would be evaluated both
+            // on all copies and on the contracted copies.
+            groupsNotContracted &= ~groupBit;
+            if (copies > maxContractedCopies)
+                maxContractedCopies = copies;
+        }
+    }
+    // The scratch buffers are shared by all contractions, so size them for the largest one.
+    if (maxContractedCopies > 0) {
+        contractedForces = CudaArray::create<long long>(cu, maxContractedCopies*paddedParticles*3, "rpmdContractedForces");
+        contractedPositions = new CudaArray(cu, maxContractedCopies*paddedParticles, elementSize, "rpmdContractedPositions");
+    }
    // Create kernels.
@@ -129,6 +161,23 @@ void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDInt
    copyToContextKernel = cu.getKernel(module, "copyDataToContext");
    copyFromContextKernel = cu.getKernel(module, "copyDataFromContext");
    translateKernel = cu.getKernel(module, "applyCellTranslations");
+    // Create kernels for doing contractions.
+    //
+    // One module is compiled per distinct contracted copy count, because the copy count is
+    // baked into the kernel source through the replacement strings below.
+    for (map<int, int>::const_iterator iter = groupsByCopies.begin(); iter != groupsByCopies.end(); ++iter) {
+        int copies = iter->first;
+        replacements.clear();
+        replacements["NUM_CONTRACTED_COPIES"] = cu.intToString(copies);
+        // contractPositions multiplies its output by POS_SCALE and contractForces multiplies
+        // its output by FORCE_SCALE (presumably to normalize the generated FFTs - TODO confirm).
+        replacements["POS_SCALE"] = cu.doubleToString(1.0/numCopies);
+        replacements["FORCE_SCALE"] = cu.doubleToString(1.0/copies);
+        // Positions: forward transform over all numCopies beads, backward transform over the
+        // contracted copies.  Forces take the opposite route: forward over the contracted
+        // copies, backward over all numCopies beads.
+        replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true);
+        replacements["FFT_Q_BACKWARD"] = createFFT(copies, "q", false);
+        replacements["FFT_F_FORWARD"] = createFFT(copies, "f", true);
+        replacements["FFT_F_BACKWARD"] = createFFT(numCopies, "f", false);
+        module = cu.createModule(cu.replaceStrings(CudaKernelSources::vectorOps+CudaRpmdKernelSources::rpmdContraction, replacements), defines, "");
+        // Kernels are looked up by contracted copy count when forces are computed.
+        positionContractionKernels[copies] = cu.getKernel(module, "contractPositions");
+        forceContractionKernels[copies] = cu.getKernel(module, "contractForces");
+    }
 }
 void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) {
@@ -191,17 +240,49 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr
 }
+/**
+ * Compute the forces acting on every copy of the ring polymer.
+ *
+ * Force groups without a contraction are evaluated once per copy.  Every other set of
+ * groups is evaluated on a reduced number of contracted copies, and the force contraction
+ * kernel then maps the result back onto the full set of copies.
+ *
+ * @param context    the context used to evaluate forces for one copy at a time
+ */
 void CudaIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
+    // Compute forces from all groups that didn't have a specified contraction.
     for (int i = 0; i < numCopies; i++) {
+        // Hand copy i to the context, evaluate the uncontracted groups, then hand the
+        // per-copy `forces` buffer to the copy-back kernel.
         void* copyToContextArgs[] = {&velocities->getDevicePointer(), &cu.getVelm().getDevicePointer(), &positions->getDevicePointer(),
                 &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
         cu.executeKernel(copyToContextKernel, copyToContextArgs, cu.getNumAtoms());
         context.computeVirtualSites();
         context.updateContextState();
-        context.calcForcesAndEnergy(true, false);
+        context.calcForcesAndEnergy(true, false, groupsNotContracted);
         void* copyFromContextArgs[] = {&cu.getForce().getDevicePointer(), &forces->getDevicePointer(), &cu.getVelm().getDevicePointer(),
                 &velocities->getDevicePointer(), &cu.getPosq().getDevicePointer(), &positions->getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
         cu.executeKernel(copyFromContextKernel, copyFromContextArgs, cu.getNumAtoms());
     }
+    // Now loop over contractions and compute forces from them.
+    // Each map entry is (contracted copy count -> bit mask of the force groups using it).
+    for (map<int, int>::const_iterator iter = groupsByCopies.begin(); iter != groupsByCopies.end(); ++iter) {
+        int copies = iter->first;
+        int groupFlags = iter->second;
+        // Find the contracted positions.
+        void* contractPosArgs[] = {&positions->getDevicePointer(), &contractedPositions->getDevicePointer()};
+        cu.executeKernel(positionContractionKernels[copies], contractPosArgs, numParticles*numCopies, workgroupSize);
+        // Compute forces.
+        // Unlike the loop above, updateContextState() is not called for contracted copies,
+        // and the scratch buffers contractedPositions/contractedForces stand in for
+        // positions/forces.
+        for (int i = 0; i < copies; i++) {
+            void* copyToContextArgs[] = {&velocities->getDevicePointer(), &cu.getVelm().getDevicePointer(), &contractedPositions->getDevicePointer(),
+                    &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
+            cu.executeKernel(copyToContextKernel, copyToContextArgs, cu.getNumAtoms());
+            context.computeVirtualSites();
+            context.calcForcesAndEnergy(true, false, groupFlags);
+            void* copyFromContextArgs[] = {&cu.getForce().getDevicePointer(), &contractedForces->getDevicePointer(), &cu.getVelm().getDevicePointer(),
+                   &velocities->getDevicePointer(), &cu.getPosq().getDevicePointer(), &contractedPositions->getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
+            cu.executeKernel(copyFromContextKernel, copyFromContextArgs, cu.getNumAtoms());
+        }
+        // Apply the forces to the original copies.
+        // NOTE(review): `forces` already holds the uncontracted-group forces (and the result
+        // of any earlier contraction) at this point, so the contraction kernel has to add
+        // to it rather than overwrite it - confirm against contractForces.
+        void* contractForceArgs[] = {&forces->getDevicePointer(), &contractedForces->getDevicePointer()};
+        cu.executeKernel(forceContractionKernels[copies], contractForceArgs, numParticles*numCopies, workgroupSize);
+    }
 }
 double CudaIntegrateRPMDStepKernel::computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator) {

--- a/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.h
+++ b/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.h
@@ -35,6 +35,7 @@
 #include "openmm/RpmdKernels.h"
 #include "CudaContext.h"
 #include "CudaArray.h"
+#include <map>
 namespace OpenMM {
@@ -45,7 +46,7 @@ namespace OpenMM {
 class CudaIntegrateRPMDStepKernel : public IntegrateRPMDStepKernel {
 public:
    CudaIntegrateRPMDStepKernel(std::string name, const Platform& platform, CudaContext& cu) :
-            IntegrateRPMDStepKernel(name, platform), cu(cu), forces(NULL), positions(NULL), velocities(NULL) {
+            IntegrateRPMDStepKernel(name, platform), cu(cu), forces(NULL), positions(NULL), velocities(NULL), contractedForces(NULL), contractedPositions(NULL) {
    }
    ~CudaIntegrateRPMDStepKernel();
    /**
@@ -88,10 +89,16 @@ private:
    std::string createFFT(int size, const std::string& variable, bool forward);
    CudaContext& cu;
    int numCopies, numParticles, workgroupSize;
+    std::map<int, int> groupsByCopies;
+    int groupsNotContracted;
    CudaArray* forces;
    CudaArray* positions;
    CudaArray* velocities;
+    CudaArray* contractedForces;
+    CudaArray* contractedPositions;
    CUfunction pileKernel, stepKernel, velocitiesKernel, copyToContextKernel, copyFromContextKernel, translateKernel;
+    std::map<int, CUfunction> positionContractionKernels;
+    std::map<int, CUfunction> forceContractionKernels;
 };
 } // namespace OpenMM

--- a/plugins/rpmd/platforms/cuda/src/kernels/rpmdContraction.cu
+++ b/plugins/rpmd/platforms/cuda/src/kernels/rpmdContraction.cu
+/**
+ * Real part of the complex product c1*(c2r + i*c2i), where c1 = c1.x + i*c1.y and the
+ * second operand carries one complex number per vector component.
+ */
+__device__ mixed3 multiplyComplexRealPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
+    const mixed3 realTimesReal = c1.x*c2r;
+    const mixed3 imagTimesImag = c1.y*c2i;
+    return realTimesReal-imagTimesImag;
+}
+/**
+ * Imaginary part of the complex product c1*(c2r + i*c2i), where c1 = c1.x + i*c1.y and the
+ * second operand carries one complex number per vector component.
+ */
+__device__ mixed3 multiplyComplexImagPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
+    const mixed3 realTimesImag = c1.x*c2i;
+    const mixed3 imagTimesReal = c1.y*c2r;
+    return realTimesImag+imagTimesReal;
+}
+/**
+ * Real part of conj(c1)*(c2r + i*c2i), where c1 = c1.x + i*c1.y and the second operand
+ * carries one complex number per vector component.
+ */
+__device__ mixed3 multiplyComplexRealPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
+    const mixed3 realTimesReal = c1.x*c2r;
+    const mixed3 imagTimesImag = c1.y*c2i;
+    return realTimesReal+imagTimesImag;
+}
+/**
+ * Imaginary part of conj(c1)*(c2r + i*c2i), where c1 = c1.x + i*c1.y and the second
+ * operand carries one complex number per vector component.
+ */
+__device__ mixed3 multiplyComplexImagPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
+    const mixed3 realTimesImag = c1.x*c2i;
+    const mixed3 imagTimesReal = c1.y*c2r;
+    return realTimesImag-imagTimesReal;
+}
+/**
+ * Compute the contracted positions.
+ *
+ * Each run of NUM_COPIES consecutive threads within a thread block handles one particle,
+ * with one thread per copy.  The positions of all copies of the particle are transformed
+ * with FFT_Q_FORWARD, reduced to NUM_CONTRACTED_COPIES coefficients, transformed back with
+ * FFT_Q_BACKWARD and written to the contracted array scaled by POS_SCALE.
+ *
+ * NUM_CONTRACTED_COPIES, POS_SCALE, FFT_Q_FORWARD and FFT_Q_BACKWARD are placeholders that
+ * are substituted into this source before compilation; the FFT placeholders presumably
+ * expand to code operating on qreal/qimag (and possibly tempreal/tempimag and w), so the
+ * local names below should not be changed without checking the generator.
+ *
+ * NOTE(review): assumes blockDim.x is a multiple of NUM_COPIES and does not exceed
+ * THREAD_BLOCK_SIZE - confirm against the launch configuration.
+ *
+ * @param posq        positions of all copies; copy c of particle p is at p+c*PADDED_NUM_ATOMS
+ * @param contracted  output positions of the contracted copies, using the same layout
+ */
+extern "C" __global__ void contractPositions(mixed4* posq, mixed4* contracted) {
+    // Number of particles processed concurrently across the whole grid.
+    const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES;
+    // First thread of this thread's group of NUM_COPIES threads, and the index of the copy
+    // this thread handles within that group.
+    const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES);
+    const int indexInBlock = threadIdx.x-blockStart;
+    // Each shared array stores real parts in its first blockDim.x entries and imaginary
+    // parts starting at offset blockDim.x.
+    __shared__ mixed3 q[2*THREAD_BLOCK_SIZE];
+    __shared__ mixed3 temp[2*THREAD_BLOCK_SIZE];
+    __shared__ mixed2 w[NUM_COPIES];
+    mixed3* qreal = &q[blockStart];
+    mixed3* qimag = &q[blockStart+blockDim.x];
+    mixed3* tempreal = &temp[blockStart];
+    mixed3* tempimag = &temp[blockStart+blockDim.x];
+    // w[k] = exp(-2*pi*i*k/NUM_COPIES), filled in once per thread block.
+    if (threadIdx.x < NUM_COPIES)
+        w[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
+    __syncthreads();
+    // NOTE(review): the loop body contains __syncthreads(), so all threads of a block need
+    // the same trip count; groups of a block that run out of particles earlier leave the
+    // loop before the others - confirm this is safe on the targeted hardware.
+    for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
+        // Load the particle position.
+        mixed4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS];
+        qreal[indexInBlock] = make_mixed3(particlePosq.x, particlePosq.y, particlePosq.z);
+        qimag[indexInBlock] = make_mixed3(0);
+        // Forward FFT.
+        __syncthreads();
+        FFT_Q_FORWARD
+        // With a single contracted copy only element 0 is used, so no compression or
+        // backward transform is needed.
+        if (NUM_CONTRACTED_COPIES > 1) {
+            // Compress the data to remove high frequencies.
+            // The first `start` coefficients and the last NUM_CONTRACTED_COPIES-start
+            // coefficients are kept and packed into the first NUM_CONTRACTED_COPIES slots.
+            int start = (NUM_CONTRACTED_COPIES+1)/2;
+            // Stage through temp so the packing below does not read slots that another
+            // thread has already overwritten.
+            tempreal[indexInBlock] = qreal[indexInBlock];
+            tempimag[indexInBlock] = qimag[indexInBlock];
+            __syncthreads();
+            if (indexInBlock < NUM_CONTRACTED_COPIES) {
+                qreal[indexInBlock] = tempreal[indexInBlock < start ? indexInBlock : indexInBlock+(NUM_COPIES-NUM_CONTRACTED_COPIES)];
+                qimag[indexInBlock] = tempimag[indexInBlock < start ? indexInBlock : indexInBlock+(NUM_COPIES-NUM_CONTRACTED_COPIES)];
+            }
+            __syncthreads();
+            FFT_Q_BACKWARD
+        }
+        // Store results.
+        // Only the first NUM_CONTRACTED_COPIES threads of each group write; the w component
+        // is copied unchanged from the position this thread loaded.
+        if (indexInBlock < NUM_CONTRACTED_COPIES)
+            contracted[particle+indexInBlock*PADDED_NUM_ATOMS] = make_mixed4(POS_SCALE*qreal[indexInBlock].x, POS_SCALE*qreal[indexInBlock].y, POS_SCALE*qreal[indexInBlock].z, particlePosq.w);
+    }
+}
+/**
+ * Apply the contracted forces to all copies.
+ *
+ * Each run of NUM_COPIES consecutive threads within a thread block handles one particle,
+ * with one thread per copy.  The forces computed on the NUM_CONTRACTED_COPIES contracted
+ * copies are transformed with FFT_F_FORWARD, zero-padded at the high frequencies up to
+ * NUM_COPIES coefficients, transformed back with FFT_F_BACKWARD, scaled by FORCE_SCALE and
+ * added to the force of every copy.
+ *
+ * The result is accumulated into `force` rather than assigned: the host code fills that
+ * buffer with the forces of the uncontracted groups before launching this kernel, and
+ * launches the kernel once per contraction level, so an assignment would discard every
+ * contribution except the last one.
+ *
+ * NUM_CONTRACTED_COPIES, FORCE_SCALE, FFT_F_FORWARD and FFT_F_BACKWARD are placeholders that
+ * are substituted into this source before compilation; the FFT placeholders presumably
+ * expand to code operating on freal/fimag (and possibly tempreal/tempimag and w), so the
+ * local names below should not be changed without checking the generator.
+ *
+ * NOTE(review): assumes blockDim.x is a multiple of NUM_COPIES and does not exceed
+ * THREAD_BLOCK_SIZE - confirm against the launch configuration.
+ *
+ * @param force       forces on all copies, updated in place; the x component for copy c of
+ *                    particle p is at p+c*PADDED_NUM_ATOMS*3, with y and z following at
+ *                    strides of PADDED_NUM_ATOMS
+ * @param contracted  forces computed on the contracted copies, using the same layout
+ */
+extern "C" __global__ void contractForces(long long* force, long long* contracted) {
+    // Number of particles processed concurrently across the whole grid.
+    const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES;
+    // First thread of this thread's group of NUM_COPIES threads, and the index of the copy
+    // this thread handles within that group.
+    const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES);
+    const int indexInBlock = threadIdx.x-blockStart;
+    // NOTE(review): forceScale is not referenced by the code written in this kernel; the
+    // contraction is linear, so it is applied directly to the raw 64-bit values.  The
+    // declaration is retained in case the generated FFT code refers to it - confirm, then
+    // remove it.
+    const mixed forceScale = 1/(mixed) 0x100000000;
+    // Each shared array stores real parts in its first blockDim.x entries and imaginary
+    // parts starting at offset blockDim.x.
+    __shared__ mixed3 f[2*THREAD_BLOCK_SIZE];
+    __shared__ mixed3 temp[2*THREAD_BLOCK_SIZE];
+    __shared__ mixed2 w[NUM_COPIES];
+    mixed3* freal = &f[blockStart];
+    mixed3* fimag = &f[blockStart+blockDim.x];
+    mixed3* tempreal = &temp[blockStart];
+    mixed3* tempimag = &temp[blockStart+blockDim.x];
+    // w[k] = exp(-2*pi*i*k/NUM_COPIES), filled in once per thread block.
+    if (threadIdx.x < NUM_COPIES)
+        w[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
+    __syncthreads();
+    // NOTE(review): the loop body contains __syncthreads(), so all threads of a block need
+    // the same trip count; groups of a block that run out of particles earlier leave the
+    // loop before the others - confirm this is safe on the targeted hardware.
+    for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
+        // Load the force.  Only the first NUM_CONTRACTED_COPIES threads of each group have
+        // a contracted copy to read.
+        int forceIndex = particle+indexInBlock*PADDED_NUM_ATOMS*3;
+        if (indexInBlock < NUM_CONTRACTED_COPIES) {
+            freal[indexInBlock] = make_mixed3(contracted[forceIndex], contracted[forceIndex+PADDED_NUM_ATOMS], contracted[forceIndex+PADDED_NUM_ATOMS*2]);
+            fimag[indexInBlock] = make_mixed3(0);
+        }
+        __syncthreads();
+        // Forward FFT.
+        if (NUM_CONTRACTED_COPIES > 1) {
+            FFT_F_FORWARD
+        }
+        // Set the high frequency components to 0.
+        // Coefficients [0, start) stay in place, [start, end) become zero, and the
+        // remaining contracted coefficients move to the top of the spectrum.
+        int start = (NUM_CONTRACTED_COPIES+1)/2;
+        int end = NUM_COPIES-NUM_CONTRACTED_COPIES+start;
+        // Stage through temp so the shift below does not read slots that another thread
+        // has already overwritten.
+        tempreal[indexInBlock] = freal[indexInBlock];
+        tempimag[indexInBlock] = fimag[indexInBlock];
+        __syncthreads();
+        if (indexInBlock >= start) {
+            freal[indexInBlock] = (indexInBlock < end ? make_mixed3(0) : tempreal[indexInBlock-(NUM_COPIES-NUM_CONTRACTED_COPIES)]);
+            fimag[indexInBlock] = (indexInBlock < end ? make_mixed3(0) : tempimag[indexInBlock-(NUM_COPIES-NUM_CONTRACTED_COPIES)]);
+        }
+        __syncthreads();
+        FFT_F_BACKWARD
+        // Store results.  Accumulate so the forces already present for this copy are
+        // kept; each thread owns its own three slots, so no atomics are needed.
+        force[forceIndex] += (long long) (FORCE_SCALE*freal[indexInBlock].x);
+        force[forceIndex+PADDED_NUM_ATOMS] += (long long) (FORCE_SCALE*freal[indexInBlock].y);
+        force[forceIndex+PADDED_NUM_ATOMS*2] += (long long) (FORCE_SCALE*freal[indexInBlock].z);
+    }
+}
--- a/plugins/rpmd/platforms/cuda/tests/TestCudaRpmd.cpp
+++ b/plugins/rpmd/platforms/cuda/tests/TestCudaRpmd.cpp
@@ -355,6 +355,82 @@ void testVirtualSites() {
    ASSERT_USUALLY_EQUAL_TOL(expectedKE, meanKE, 1e-2);
 }
+/**
+ * Run RPMD with ring polymer contractions on a periodic cloud of diatomic molecules and
+ * check that the average kinetic energy matches the value expected for the thermostat
+ * temperature.
+ */
+void testContractions() {
+    const int gridSize = 3;
+    const int numMolecules = gridSize*gridSize*gridSize;
+    const int numParticles = numMolecules*2;
+    const int numCopies = 10;
+    const double spacing = 2.0;
+    const double cutoff = 3.0;
+    const double boxSize = spacing*(gridSize+1);
+    const double temperature = 300.0;
+    // Bonds stay in the default force group; the direct-space and reciprocal-space
+    // nonbonded interactions get groups 1 and 2 so each can be contracted separately.
+    System system;
+    system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    HarmonicBondForce* bonds = new HarmonicBondForce();
+    system.addForce(bonds);
+    NonbondedForce* nonbonded = new NonbondedForce();
+    nonbonded->setCutoffDistance(cutoff);
+    nonbonded->setNonbondedMethod(NonbondedForce::PME);
+    nonbonded->setForceGroup(1);
+    nonbonded->setReciprocalSpaceForceGroup(2);
+    system.addForce(nonbonded);
+    // Create a cloud of molecules.
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    vector<Vec3> positions(numParticles);
+    for (int molecule = 0; molecule < numMolecules; molecule++) {
+        const int first = 2*molecule;
+        const int second = 2*molecule+1;
+        system.addParticle(1.0);
+        system.addParticle(1.0);
+        nonbonded->addParticle(-0.2, 0.2, 0.2);
+        nonbonded->addParticle(0.2, 0.2, 0.2);
+        nonbonded->addException(first, second, 0, 1, 0);
+        bonds->addBond(first, second, 1.0, 10000.0);
+    }
+    // Group 1 is evaluated on 3 contracted copies and group 2 on a single copy.
+    map<int, int> contractions;
+    contractions[1] = 3;
+    contractions[2] = 1;
+    RPMDIntegrator integ(numCopies, temperature, 10.0, 0.001, contractions);
+    Platform& platform = Platform::getPlatformByName("CUDA");
+    Context context(system, integ, platform);
+    // Give every copy its own randomly perturbed grid of molecules.
+    for (int copy = 0; copy < numCopies; copy++) {
+        for (int ix = 0; ix < gridSize; ix++) {
+            for (int iy = 0; iy < gridSize; iy++) {
+                for (int iz = 0; iz < gridSize; iz++) {
+                    Vec3 pos = Vec3(spacing*(ix+0.02*genrand_real2(sfmt)), spacing*(iy+0.02*genrand_real2(sfmt)), spacing*(iz+0.02*genrand_real2(sfmt)));
+                    const int molecule = iz+gridSize*(iy+gridSize*ix);
+                    positions[2*molecule] = pos;
+                    positions[2*molecule+1] = Vec3(pos[0]+1.0, pos[1], pos[2]);
+                }
+            }
+        }
+        integ.setPositions(copy, positions);
+    }
+    // Check the temperature.
+    const int numSteps = 1000;
+    // Equilibrate before any kinetic energy is accumulated.
+    integ.step(1000);
+    vector<double> ke(numCopies, 0.0);
+    for (int step = 0; step < numSteps; step++) {
+        integ.step(1);
+        // Fetch the velocities of every copy first, then accumulate per-copy totals.
+        vector<State> snapshots(numCopies);
+        for (int copy = 0; copy < numCopies; copy++)
+            snapshots[copy] = integ.getState(copy, State::Velocities, true);
+        for (int copy = 0; copy < numCopies; copy++) {
+            for (int particle = 0; particle < numParticles; particle++) {
+                Vec3 v = snapshots[copy].getVelocities()[particle];
+                ke[copy] += 0.5*system.getParticleMass(particle)*v.dot(v);
+            }
+        }
+    }
+    // Average over both the sampled steps and the copies.
+    double meanKE = 0.0;
+    for (int copy = 0; copy < numCopies; copy++)
+        meanKE += ke[copy];
+    meanKE /= numSteps*numCopies;
+    double expectedKE = 0.5*numCopies*numParticles*3*BOLTZ*temperature;
+    ASSERT_USUALLY_EQUAL_TOL(expectedKE, meanKE, 1e-2);
+}
 int main(int argc, char* argv[]) {
    try {
        registerRPMDCudaKernelFactories();