Commit 863447a6 authored by peastman's avatar peastman
Browse files

Created CUDA implementation of ring polymer contraction

parent 2e0ebb67
...@@ -69,6 +69,10 @@ CudaIntegrateRPMDStepKernel::~CudaIntegrateRPMDStepKernel() { ...@@ -69,6 +69,10 @@ CudaIntegrateRPMDStepKernel::~CudaIntegrateRPMDStepKernel() {
delete positions; delete positions;
if (velocities != NULL) if (velocities != NULL)
delete velocities; delete velocities;
if (contractedForces != NULL)
delete contractedForces;
if (contractedPositions != NULL)
delete contractedPositions;
} }
void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) { void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) {
...@@ -106,6 +110,34 @@ void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDInt ...@@ -106,6 +110,34 @@ void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDInt
temp[i] = make_float4(0, 0, 0, 1); temp[i] = make_float4(0, 0, 0, 1);
velocities->upload(temp); velocities->upload(temp);
} }
// Build a list of contractions.
groupsNotContracted = -1;
const map<int, int>& contractions = integrator.getContractions();
int maxContractedCopies = 0;
for (map<int, int>::const_iterator iter = contractions.begin(); iter != contractions.end(); ++iter) {
int group = iter->first;
int copies = iter->second;
if (group < 0 || group > 31)
throw OpenMMException("RPMDIntegrator: Force group must be between 0 and 31");
if (copies < 0 || copies > numCopies)
throw OpenMMException("RPMDIntegrator: Number of copies for contraction cannot be greater than the total number of copies being simulated");
if (copies != numCopies) {
if (groupsByCopies.find(copies) == groupsByCopies.end()) {
groupsByCopies[copies] = 1<<group;
groupsNotContracted -= 1<<group;
if (copies > maxContractedCopies)
maxContractedCopies = copies;
}
else
groupsByCopies[copies] |= 1<<group;
}
}
if (maxContractedCopies > 0) {
contractedForces = CudaArray::create<long long>(cu, maxContractedCopies*paddedParticles*3, "rpmdContractedForces");
contractedPositions = new CudaArray(cu, maxContractedCopies*paddedParticles, elementSize, "rpmdContractedPositions");
}
// Create kernels. // Create kernels.
...@@ -129,6 +161,23 @@ void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDInt ...@@ -129,6 +161,23 @@ void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDInt
copyToContextKernel = cu.getKernel(module, "copyDataToContext"); copyToContextKernel = cu.getKernel(module, "copyDataToContext");
copyFromContextKernel = cu.getKernel(module, "copyDataFromContext"); copyFromContextKernel = cu.getKernel(module, "copyDataFromContext");
translateKernel = cu.getKernel(module, "applyCellTranslations"); translateKernel = cu.getKernel(module, "applyCellTranslations");
// Create kernels for doing contractions.
for (map<int, int>::const_iterator iter = groupsByCopies.begin(); iter != groupsByCopies.end(); ++iter) {
int copies = iter->first;
replacements.clear();
replacements["NUM_CONTRACTED_COPIES"] = cu.intToString(copies);
replacements["POS_SCALE"] = cu.doubleToString(1.0/numCopies);
replacements["FORCE_SCALE"] = cu.doubleToString(1.0/copies);
replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true);
replacements["FFT_Q_BACKWARD"] = createFFT(copies, "q", false);
replacements["FFT_F_FORWARD"] = createFFT(copies, "f", true);
replacements["FFT_F_BACKWARD"] = createFFT(numCopies, "f", false);
module = cu.createModule(cu.replaceStrings(CudaKernelSources::vectorOps+CudaRpmdKernelSources::rpmdContraction, replacements), defines, "");
positionContractionKernels[copies] = cu.getKernel(module, "contractPositions");
forceContractionKernels[copies] = cu.getKernel(module, "contractForces");
}
} }
void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) { void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) {
...@@ -191,17 +240,49 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr ...@@ -191,17 +240,49 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr
} }
void CudaIntegrateRPMDStepKernel::computeForces(ContextImpl& context) { void CudaIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
// Compute forces from all groups that didn't have a specified contraction.
for (int i = 0; i < numCopies; i++) { for (int i = 0; i < numCopies; i++) {
void* copyToContextArgs[] = {&velocities->getDevicePointer(), &cu.getVelm().getDevicePointer(), &positions->getDevicePointer(), void* copyToContextArgs[] = {&velocities->getDevicePointer(), &cu.getVelm().getDevicePointer(), &positions->getDevicePointer(),
&cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i}; &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
cu.executeKernel(copyToContextKernel, copyToContextArgs, cu.getNumAtoms()); cu.executeKernel(copyToContextKernel, copyToContextArgs, cu.getNumAtoms());
context.computeVirtualSites(); context.computeVirtualSites();
context.updateContextState(); context.updateContextState();
context.calcForcesAndEnergy(true, false); context.calcForcesAndEnergy(true, false, groupsNotContracted);
void* copyFromContextArgs[] = {&cu.getForce().getDevicePointer(), &forces->getDevicePointer(), &cu.getVelm().getDevicePointer(), void* copyFromContextArgs[] = {&cu.getForce().getDevicePointer(), &forces->getDevicePointer(), &cu.getVelm().getDevicePointer(),
&velocities->getDevicePointer(), &cu.getPosq().getDevicePointer(), &positions->getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i}; &velocities->getDevicePointer(), &cu.getPosq().getDevicePointer(), &positions->getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
cu.executeKernel(copyFromContextKernel, copyFromContextArgs, cu.getNumAtoms()); cu.executeKernel(copyFromContextKernel, copyFromContextArgs, cu.getNumAtoms());
} }
// Now loop over contractions and compute forces from them.
for (map<int, int>::const_iterator iter = groupsByCopies.begin(); iter != groupsByCopies.end(); ++iter) {
int copies = iter->first;
int groupFlags = iter->second;
// Find the contracted positions.
void* contractPosArgs[] = {&positions->getDevicePointer(), &contractedPositions->getDevicePointer()};
cu.executeKernel(positionContractionKernels[copies], contractPosArgs, numParticles*numCopies, workgroupSize);
// Compute forces.
for (int i = 0; i < copies; i++) {
void* copyToContextArgs[] = {&velocities->getDevicePointer(), &cu.getVelm().getDevicePointer(), &contractedPositions->getDevicePointer(),
&cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
cu.executeKernel(copyToContextKernel, copyToContextArgs, cu.getNumAtoms());
context.computeVirtualSites();
context.calcForcesAndEnergy(true, false, groupFlags);
void* copyFromContextArgs[] = {&cu.getForce().getDevicePointer(), &contractedForces->getDevicePointer(), &cu.getVelm().getDevicePointer(),
&velocities->getDevicePointer(), &cu.getPosq().getDevicePointer(), &contractedPositions->getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
cu.executeKernel(copyFromContextKernel, copyFromContextArgs, cu.getNumAtoms());
}
// Apply the forces to the original copies.
void* contractForceArgs[] = {&forces->getDevicePointer(), &contractedForces->getDevicePointer()};
cu.executeKernel(forceContractionKernels[copies], contractForceArgs, numParticles*numCopies, workgroupSize);
}
} }
double CudaIntegrateRPMDStepKernel::computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator) { double CudaIntegrateRPMDStepKernel::computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator) {
......
...@@ -35,6 +35,7 @@ ...@@ -35,6 +35,7 @@
#include "openmm/RpmdKernels.h" #include "openmm/RpmdKernels.h"
#include "CudaContext.h" #include "CudaContext.h"
#include "CudaArray.h" #include "CudaArray.h"
#include <map>
namespace OpenMM { namespace OpenMM {
...@@ -45,7 +46,7 @@ namespace OpenMM { ...@@ -45,7 +46,7 @@ namespace OpenMM {
class CudaIntegrateRPMDStepKernel : public IntegrateRPMDStepKernel { class CudaIntegrateRPMDStepKernel : public IntegrateRPMDStepKernel {
public: public:
CudaIntegrateRPMDStepKernel(std::string name, const Platform& platform, CudaContext& cu) : CudaIntegrateRPMDStepKernel(std::string name, const Platform& platform, CudaContext& cu) :
IntegrateRPMDStepKernel(name, platform), cu(cu), forces(NULL), positions(NULL), velocities(NULL) { IntegrateRPMDStepKernel(name, platform), cu(cu), forces(NULL), positions(NULL), velocities(NULL), contractedForces(NULL), contractedPositions(NULL) {
} }
~CudaIntegrateRPMDStepKernel(); ~CudaIntegrateRPMDStepKernel();
/** /**
...@@ -88,10 +89,16 @@ private: ...@@ -88,10 +89,16 @@ private:
std::string createFFT(int size, const std::string& variable, bool forward); std::string createFFT(int size, const std::string& variable, bool forward);
CudaContext& cu; CudaContext& cu;
int numCopies, numParticles, workgroupSize; int numCopies, numParticles, workgroupSize;
std::map<int, int> groupsByCopies;
int groupsNotContracted;
CudaArray* forces; CudaArray* forces;
CudaArray* positions; CudaArray* positions;
CudaArray* velocities; CudaArray* velocities;
CudaArray* contractedForces;
CudaArray* contractedPositions;
CUfunction pileKernel, stepKernel, velocitiesKernel, copyToContextKernel, copyFromContextKernel, translateKernel; CUfunction pileKernel, stepKernel, velocitiesKernel, copyToContextKernel, copyFromContextKernel, translateKernel;
std::map<int, CUfunction> positionContractionKernels;
std::map<int, CUfunction> forceContractionKernels;
}; };
} // namespace OpenMM } // namespace OpenMM
......
__device__ mixed3 multiplyComplexRealPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
return c1.x*c2r-c1.y*c2i;
}
__device__ mixed3 multiplyComplexImagPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
return c1.x*c2i+c1.y*c2r;
}
__device__ mixed3 multiplyComplexRealPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
return c1.x*c2r+c1.y*c2i;
}
__device__ mixed3 multiplyComplexImagPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
return c1.x*c2i-c1.y*c2r;
}
/**
* Compute the contracted positions
*/
extern "C" __global__ void contractPositions(mixed4* posq, mixed4* contracted) {
const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES;
const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES);
const int indexInBlock = threadIdx.x-blockStart;
__shared__ mixed3 q[2*THREAD_BLOCK_SIZE];
__shared__ mixed3 temp[2*THREAD_BLOCK_SIZE];
__shared__ mixed2 w[NUM_COPIES];
mixed3* qreal = &q[blockStart];
mixed3* qimag = &q[blockStart+blockDim.x];
mixed3* tempreal = &temp[blockStart];
mixed3* tempimag = &temp[blockStart+blockDim.x];
if (threadIdx.x < NUM_COPIES)
w[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
__syncthreads();
for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
// Load the particle position.
mixed4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS];
qreal[indexInBlock] = make_mixed3(particlePosq.x, particlePosq.y, particlePosq.z);
qimag[indexInBlock] = make_mixed3(0);
// Forward FFT.
__syncthreads();
FFT_Q_FORWARD
if (NUM_CONTRACTED_COPIES > 1) {
// Compress the data to remove high frequencies.
int start = (NUM_CONTRACTED_COPIES+1)/2;
tempreal[indexInBlock] = qreal[indexInBlock];
tempimag[indexInBlock] = qimag[indexInBlock];
__syncthreads();
if (indexInBlock < NUM_CONTRACTED_COPIES) {
qreal[indexInBlock] = tempreal[indexInBlock < start ? indexInBlock : indexInBlock+(NUM_COPIES-NUM_CONTRACTED_COPIES)];
qimag[indexInBlock] = tempimag[indexInBlock < start ? indexInBlock : indexInBlock+(NUM_COPIES-NUM_CONTRACTED_COPIES)];
}
__syncthreads();
FFT_Q_BACKWARD
}
// Store results.
if (indexInBlock < NUM_CONTRACTED_COPIES)
contracted[particle+indexInBlock*PADDED_NUM_ATOMS] = make_mixed4(POS_SCALE*qreal[indexInBlock].x, POS_SCALE*qreal[indexInBlock].y, POS_SCALE*qreal[indexInBlock].z, particlePosq.w);
}
}
/**
* Apply the contracted forces to all copies.
*/
extern "C" __global__ void contractForces(long long* force, long long* contracted) {
const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES;
const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES);
const int indexInBlock = threadIdx.x-blockStart;
const mixed forceScale = 1/(mixed) 0x100000000;
__shared__ mixed3 f[2*THREAD_BLOCK_SIZE];
__shared__ mixed3 temp[2*THREAD_BLOCK_SIZE];
__shared__ mixed2 w[NUM_COPIES];
mixed3* freal = &f[blockStart];
mixed3* fimag = &f[blockStart+blockDim.x];
mixed3* tempreal = &temp[blockStart];
mixed3* tempimag = &temp[blockStart+blockDim.x];
if (threadIdx.x < NUM_COPIES)
w[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
__syncthreads();
for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
// Load the force.
int forceIndex = particle+indexInBlock*PADDED_NUM_ATOMS*3;
if (indexInBlock < NUM_CONTRACTED_COPIES) {
freal[indexInBlock] = make_mixed3(contracted[forceIndex], contracted[forceIndex+PADDED_NUM_ATOMS], contracted[forceIndex+PADDED_NUM_ATOMS*2]);
fimag[indexInBlock] = make_mixed3(0);
}
__syncthreads();
// Forward FFT.
if (NUM_CONTRACTED_COPIES > 1) {
FFT_F_FORWARD
}
// Set the high frequency components to 0.
int start = (NUM_CONTRACTED_COPIES+1)/2;
int end = NUM_COPIES-NUM_CONTRACTED_COPIES+start;
tempreal[indexInBlock] = freal[indexInBlock];
tempimag[indexInBlock] = fimag[indexInBlock];
__syncthreads();
if (indexInBlock >= start) {
freal[indexInBlock] = (indexInBlock < end ? make_mixed3(0) : tempreal[indexInBlock-(NUM_COPIES-NUM_CONTRACTED_COPIES)]);
fimag[indexInBlock] = (indexInBlock < end ? make_mixed3(0) : tempimag[indexInBlock-(NUM_COPIES-NUM_CONTRACTED_COPIES)]);
}
__syncthreads();
FFT_F_BACKWARD
// Store results.
force[forceIndex] = FORCE_SCALE*freal[indexInBlock].x;
force[forceIndex+PADDED_NUM_ATOMS] = FORCE_SCALE*freal[indexInBlock].y;
force[forceIndex+PADDED_NUM_ATOMS*2] = FORCE_SCALE*freal[indexInBlock].z;
}
}
...@@ -355,6 +355,82 @@ void testVirtualSites() { ...@@ -355,6 +355,82 @@ void testVirtualSites() {
ASSERT_USUALLY_EQUAL_TOL(expectedKE, meanKE, 1e-2); ASSERT_USUALLY_EQUAL_TOL(expectedKE, meanKE, 1e-2);
} }
void testContractions() {
const int gridSize = 3;
const int numMolecules = gridSize*gridSize*gridSize;
const int numParticles = numMolecules*2;
const int numCopies = 10;
const double spacing = 2.0;
const double cutoff = 3.0;
const double boxSize = spacing*(gridSize+1);
const double temperature = 300.0;
System system;
system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
HarmonicBondForce* bonds = new HarmonicBondForce();
system.addForce(bonds);
NonbondedForce* nonbonded = new NonbondedForce();
nonbonded->setCutoffDistance(cutoff);
nonbonded->setNonbondedMethod(NonbondedForce::PME);
nonbonded->setForceGroup(1);
nonbonded->setReciprocalSpaceForceGroup(2);
system.addForce(nonbonded);
// Create a cloud of molecules.
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
vector<Vec3> positions(numParticles);
for (int i = 0; i < numMolecules; i++) {
system.addParticle(1.0);
system.addParticle(1.0);
nonbonded->addParticle(-0.2, 0.2, 0.2);
nonbonded->addParticle(0.2, 0.2, 0.2);
nonbonded->addException(2*i, 2*i+1, 0, 1, 0);
bonds->addBond(2*i, 2*i+1, 1.0, 10000.0);
}
map<int, int> contractions;
contractions[1] = 3;
contractions[2] = 1;
RPMDIntegrator integ(numCopies, temperature, 10.0, 0.001, contractions);
Platform& platform = Platform::getPlatformByName("CUDA");
Context context(system, integ, platform);
for (int copy = 0; copy < numCopies; copy++) {
for (int i = 0; i < gridSize; i++)
for (int j = 0; j < gridSize; j++)
for (int k = 0; k < gridSize; k++) {
Vec3 pos = Vec3(spacing*(i+0.02*genrand_real2(sfmt)), spacing*(j+0.02*genrand_real2(sfmt)), spacing*(k+0.02*genrand_real2(sfmt)));
int index = k+gridSize*(j+gridSize*i);
positions[2*index] = pos;
positions[2*index+1] = Vec3(pos[0]+1.0, pos[1], pos[2]);
}
integ.setPositions(copy, positions);
}
// Check the temperature.
const int numSteps = 1000;
integ.step(1000);
vector<double> ke(numCopies, 0.0);
for (int i = 0; i < numSteps; i++) {
integ.step(1);
vector<State> state(numCopies);
for (int j = 0; j < numCopies; j++)
state[j] = integ.getState(j, State::Velocities, true);
for (int j = 0; j < numParticles; j++) {
for (int k = 0; k < numCopies; k++) {
Vec3 v = state[k].getVelocities()[j];
ke[k] += 0.5*system.getParticleMass(j)*v.dot(v);
}
}
}
double meanKE = 0.0;
for (int i = 0; i < numCopies; i++)
meanKE += ke[i];
meanKE /= numSteps*numCopies;
double expectedKE = 0.5*numCopies*numParticles*3*BOLTZ*temperature;
ASSERT_USUALLY_EQUAL_TOL(expectedKE, meanKE, 1e-2);
}
int main(int argc, char* argv[]) { int main(int argc, char* argv[]) {
try { try {
registerRPMDCudaKernelFactories(); registerRPMDCudaKernelFactories();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment