Commit cd6af26e authored by Peter Eastman's avatar Peter Eastman
Browse files

RPMD supports mixed and double precision

parent b8b2e1ef
...@@ -70,6 +70,7 @@ CudaIntegrateRPMDStepKernel::~CudaIntegrateRPMDStepKernel() { ...@@ -70,6 +70,7 @@ CudaIntegrateRPMDStepKernel::~CudaIntegrateRPMDStepKernel() {
if (velocities != NULL) if (velocities != NULL)
delete velocities; delete velocities;
} }
void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) { void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) {
cu.getPlatformData().initializeContexts(system); cu.getPlatformData().initializeContexts(system);
numCopies = integrator.getNumCopies(); numCopies = integrator.getNumCopies();
...@@ -80,20 +81,33 @@ void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDInt ...@@ -80,20 +81,33 @@ void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDInt
if (numCopies != findFFTDimension(numCopies)) if (numCopies != findFFTDimension(numCopies))
throw OpenMMException("RPMDIntegrator: the number of copies must be a multiple of powers of 2, 3, and 5."); throw OpenMMException("RPMDIntegrator: the number of copies must be a multiple of powers of 2, 3, and 5.");
int paddedParticles = cu.getPaddedNumAtoms(); int paddedParticles = cu.getPaddedNumAtoms();
bool useDoublePrecision = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision());
int elementSize = (useDoublePrecision ? sizeof(double4) : sizeof(float4));
forces = CudaArray::create<long long>(cu, numCopies*paddedParticles*3, "rpmdForces"); forces = CudaArray::create<long long>(cu, numCopies*paddedParticles*3, "rpmdForces");
positions = CudaArray::create<float4>(cu, numCopies*paddedParticles, "rpmdPositions"); positions = new CudaArray(cu, numCopies*paddedParticles, elementSize, "rpmdPositions");
velocities = CudaArray::create<float4>(cu, numCopies*paddedParticles, "rpmdVelocities"); velocities = new CudaArray(cu, numCopies*paddedParticles, elementSize, "rpmdVelocities");
cu.getIntegrationUtilities().initRandomNumberGenerator((unsigned int) integrator.getRandomNumberSeed()); cu.getIntegrationUtilities().initRandomNumberGenerator((unsigned int) integrator.getRandomNumberSeed());
// Fill in the posq and velm arrays with safe values to avoid a risk of nans. // Fill in the posq and velm arrays with safe values to avoid a risk of nans.
vector<float4> temp(positions->getSize()); if (useDoublePrecision) {
for (int i = 0; i < positions->getSize(); i++) vector<double4> temp(positions->getSize());
temp[i] = make_float4(0, 0, 0, 0); for (int i = 0; i < positions->getSize(); i++)
positions->upload(temp); temp[i] = make_double4(0, 0, 0, 0);
for (int i = 0; i < velocities->getSize(); i++) positions->upload(temp);
temp[i] = make_float4(0, 0, 0, 1); for (int i = 0; i < velocities->getSize(); i++)
velocities->upload(temp); temp[i] = make_double4(0, 0, 0, 1);
velocities->upload(temp);
}
else {
vector<float4> temp(positions->getSize());
for (int i = 0; i < positions->getSize(); i++)
temp[i] = make_float4(0, 0, 0, 0);
positions->upload(temp);
for (int i = 0; i < velocities->getSize(); i++)
temp[i] = make_float4(0, 0, 0, 1);
velocities->upload(temp);
}
// Create kernels. // Create kernels.
...@@ -114,8 +128,9 @@ void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDInt ...@@ -114,8 +128,9 @@ void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDInt
pileKernel = cu.getKernel(module, "applyPileThermostat"); pileKernel = cu.getKernel(module, "applyPileThermostat");
stepKernel = cu.getKernel(module, "integrateStep"); stepKernel = cu.getKernel(module, "integrateStep");
velocitiesKernel = cu.getKernel(module, "advanceVelocities"); velocitiesKernel = cu.getKernel(module, "advanceVelocities");
copyToContextKernel = cu.getKernel(module, "copyToContext"); copyPositionsToContextKernel = cu.getKernel(module, "copyPositionsToContext");
copyFromContextKernel = cu.getKernel(module, "copyFromContext"); copyVelocitiesToContextKernel = cu.getKernel(module, "copyVelocitiesToContext");
copyForcesFromContextKernel = cu.getKernel(module, "copyForcesFromContext");
translateKernel = cu.getKernel(module, "applyCellTranslations"); translateKernel = cu.getKernel(module, "applyCellTranslations");
} }
...@@ -129,19 +144,23 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr ...@@ -129,19 +144,23 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr
// Apply the PILE-L thermostat. // Apply the PILE-L thermostat.
bool useDoublePrecision = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision());
double dt = integrator.getStepSize(); double dt = integrator.getStepSize();
float dtFloat = (float) dt; float dtFloat = (float) dt;
void* dtPtr = (useDoublePrecision ? (void*) &dt : (void*) &dtFloat);
double kT = integrator.getTemperature()*BOLTZ; double kT = integrator.getTemperature()*BOLTZ;
float kTFloat = (float) kT; float kTFloat = (float) kT;
void* kTPtr = (useDoublePrecision ? (void*) &kT : (void*) &kTFloat);
double friction = integrator.getFriction(); double friction = integrator.getFriction();
float frictionFloat = (float) friction; float frictionFloat = (float) friction;
void* frictionPtr = (useDoublePrecision ? (void*) &friction : (void*) &frictionFloat);
int randomIndex = integration.prepareRandomNumbers(numParticles*numCopies); int randomIndex = integration.prepareRandomNumbers(numParticles*numCopies);
void* pileArgs[] = {&velocities->getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex, &dtFloat, &kTFloat, &frictionFloat}; void* pileArgs[] = {&velocities->getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex, dtPtr, kTPtr, frictionPtr};
cu.executeKernel(pileKernel, pileArgs, numParticles*numCopies, workgroupSize); cu.executeKernel(pileKernel, pileArgs, numParticles*numCopies, workgroupSize);
// Update positions and velocities. // Update positions and velocities.
void* stepArgs[] = {&positions->getDevicePointer(), &velocities->getDevicePointer(), &forces->getDevicePointer(), &dtFloat, &kTFloat}; void* stepArgs[] = {&positions->getDevicePointer(), &velocities->getDevicePointer(), &forces->getDevicePointer(), dtPtr, kTPtr};
cu.executeKernel(stepKernel, stepArgs, numParticles*numCopies, workgroupSize); cu.executeKernel(stepKernel, stepArgs, numParticles*numCopies, workgroupSize);
// Calculate forces based on the updated positions. // Calculate forces based on the updated positions.
...@@ -150,7 +169,7 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr ...@@ -150,7 +169,7 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr
// Update velocities. // Update velocities.
void* velocitiesArgs[] = {&velocities->getDevicePointer(), &forces->getDevicePointer(), &dtFloat}; void* velocitiesArgs[] = {&velocities->getDevicePointer(), &forces->getDevicePointer(), dtPtr};
cu.executeKernel(velocitiesKernel, velocitiesArgs, numParticles*numCopies, workgroupSize); cu.executeKernel(velocitiesKernel, velocitiesArgs, numParticles*numCopies, workgroupSize);
// Apply the PILE-L thermostat again. // Apply the PILE-L thermostat again.
...@@ -167,10 +186,10 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr ...@@ -167,10 +186,10 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr
void CudaIntegrateRPMDStepKernel::computeForces(ContextImpl& context) { void CudaIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
for (int i = 0; i < numCopies; i++) { for (int i = 0; i < numCopies; i++) {
void* copyToContextArgs[] = {&positions->getDevicePointer(), &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i}; void* copyToContextArgs[] = {&positions->getDevicePointer(), &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
cu.executeKernel(copyToContextKernel, copyToContextArgs, cu.getNumAtoms()); cu.executeKernel(copyPositionsToContextKernel, copyToContextArgs, cu.getNumAtoms());
context.calcForcesAndEnergy(true, false); context.calcForcesAndEnergy(true, false);
void* copyFromContextArgs[] = {&cu.getForce().getDevicePointer(), &forces->getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i}; void* copyFromContextArgs[] = {&cu.getForce().getDevicePointer(), &forces->getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
cu.executeKernel(copyFromContextKernel, copyFromContextArgs, cu.getNumAtoms()); cu.executeKernel(copyForcesFromContextKernel, copyFromContextArgs, cu.getNumAtoms());
if (cu.getAtomsWereReordered() && cu.getNonbondedUtilities().getUsePeriodic()) { if (cu.getAtomsWereReordered() && cu.getNonbondedUtilities().getUsePeriodic()) {
// Atoms may have been translated into a different periodic box, so apply // Atoms may have been translated into a different periodic box, so apply
// the same translation to all the beads. // the same translation to all the beads.
...@@ -190,11 +209,29 @@ void CudaIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& pos ...@@ -190,11 +209,29 @@ void CudaIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& pos
throw OpenMMException("RPMDIntegrator: Cannot set positions before the integrator is added to a Context"); throw OpenMMException("RPMDIntegrator: Cannot set positions before the integrator is added to a Context");
if (pos.size() != numParticles) if (pos.size() != numParticles)
throw OpenMMException("RPMDIntegrator: wrong number of values passed to setPositions()"); throw OpenMMException("RPMDIntegrator: wrong number of values passed to setPositions()");
vector<float4> posq(cu.getPaddedNumAtoms()); CUresult result;
cu.getPosq().download(posq); if (cu.getUseDoublePrecision()) {
for (int i = 0; i < numParticles; i++) vector<double4> posq(cu.getPaddedNumAtoms());
posq[i] = make_float4((float) pos[i][0], (float) pos[i][1], (float) pos[i][2], posq[i].w); cu.getPosq().download(posq);
CUresult result = cuMemcpyHtoD(positions->getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(float4), &posq[0], numParticles*sizeof(float4)); for (int i = 0; i < numParticles; i++)
posq[i] = make_double4(pos[i][0], pos[i][1], pos[i][2], posq[i].w);
result = cuMemcpyHtoD(positions->getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(double4), &posq[0], numParticles*sizeof(double4));
}
else if (cu.getUseMixedPrecision()) {
vector<float4> posqf(cu.getPaddedNumAtoms());
cu.getPosq().download(posqf);
vector<double4> posq(cu.getPaddedNumAtoms());
for (int i = 0; i < numParticles; i++)
posq[i] = make_double4(pos[i][0], pos[i][1], pos[i][2], posqf[i].w);
result = cuMemcpyHtoD(positions->getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(double4), &posq[0], numParticles*sizeof(double4));
}
else {
vector<float4> posq(cu.getPaddedNumAtoms());
cu.getPosq().download(posq);
for (int i = 0; i < numParticles; i++)
posq[i] = make_float4((float) pos[i][0], (float) pos[i][1], (float) pos[i][2], posq[i].w);
result = cuMemcpyHtoD(positions->getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(float4), &posq[0], numParticles*sizeof(float4));
}
if (result != CUDA_SUCCESS) { if (result != CUDA_SUCCESS) {
std::stringstream str; std::stringstream str;
str<<"Error uploading array "<<positions->getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")"; str<<"Error uploading array "<<positions->getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
...@@ -207,11 +244,21 @@ void CudaIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& ve ...@@ -207,11 +244,21 @@ void CudaIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& ve
throw OpenMMException("RPMDIntegrator: Cannot set velocities before the integrator is added to a Context"); throw OpenMMException("RPMDIntegrator: Cannot set velocities before the integrator is added to a Context");
if (vel.size() != numParticles) if (vel.size() != numParticles)
throw OpenMMException("RPMDIntegrator: wrong number of values passed to setVelocities()"); throw OpenMMException("RPMDIntegrator: wrong number of values passed to setVelocities()");
vector<float4> velm(cu.getPaddedNumAtoms()); CUresult result;
cu.getVelm().download(velm); if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
for (int i = 0; i < numParticles; i++) vector<double4> velm(cu.getPaddedNumAtoms());
velm[i] = make_float4((float) vel[i][0], (float) vel[i][1], (float) vel[i][2], velm[i].w); cu.getVelm().download(velm);
CUresult result = cuMemcpyHtoD(velocities->getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(float4), &velm[0], numParticles*sizeof(float4)); for (int i = 0; i < numParticles; i++)
velm[i] = make_double4(vel[i][0], vel[i][1], vel[i][2], velm[i].w);
result = cuMemcpyHtoD(velocities->getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(double4), &velm[0], numParticles*sizeof(double4));
}
else {
vector<float4> velm(cu.getPaddedNumAtoms());
cu.getVelm().download(velm);
for (int i = 0; i < numParticles; i++)
velm[i] = make_float4((float) vel[i][0], (float) vel[i][1], (float) vel[i][2], velm[i].w);
result = cuMemcpyHtoD(velocities->getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(float4), &velm[0], numParticles*sizeof(float4));
}
if (result != CUDA_SUCCESS) { if (result != CUDA_SUCCESS) {
std::stringstream str; std::stringstream str;
str<<"Error uploading array "<<velocities->getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")"; str<<"Error uploading array "<<velocities->getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
...@@ -221,9 +268,9 @@ void CudaIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& ve ...@@ -221,9 +268,9 @@ void CudaIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& ve
void CudaIntegrateRPMDStepKernel::copyToContext(int copy, ContextImpl& context) { void CudaIntegrateRPMDStepKernel::copyToContext(int copy, ContextImpl& context) {
void* copyPositionsArgs[] = {&positions->getDevicePointer(), &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &copy}; void* copyPositionsArgs[] = {&positions->getDevicePointer(), &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &copy};
cu.executeKernel(copyToContextKernel, copyPositionsArgs, cu.getNumAtoms()); cu.executeKernel(copyPositionsToContextKernel, copyPositionsArgs, cu.getNumAtoms());
void* copyVelocitiesArgs[] = {&velocities->getDevicePointer(), &cu.getVelm().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &copy}; void* copyVelocitiesArgs[] = {&velocities->getDevicePointer(), &cu.getVelm().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &copy};
cu.executeKernel(copyToContextKernel, copyVelocitiesArgs, cu.getNumAtoms()); cu.executeKernel(copyVelocitiesToContextKernel, copyVelocitiesArgs, cu.getNumAtoms());
} }
string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable, bool forward) { string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable, bool forward) {
...@@ -237,10 +284,10 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable, ...@@ -237,10 +284,10 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
string multImag = (forward ? "multiplyComplexImagPart" : "multiplyComplexImagPartConj"); string multImag = (forward ? "multiplyComplexImagPart" : "multiplyComplexImagPartConj");
source<<"{\n"; source<<"{\n";
source<<"float3* real0 = "<<variable<<"real;\n"; source<<"mixed3* real0 = "<<variable<<"real;\n";
source<<"float3* imag0 = "<<variable<<"imag;\n"; source<<"mixed3* imag0 = "<<variable<<"imag;\n";
source<<"float3* real1 = &temp[blockStart];\n"; source<<"mixed3* real1 = &temp[blockStart];\n";
source<<"float3* imag1 = &temp[blockStart+blockDim.x];\n"; source<<"mixed3* imag1 = &temp[blockStart+blockDim.x];\n";
// Factor size, generating an appropriate block of code for each factor. // Factor size, generating an appropriate block of code for each factor.
...@@ -254,39 +301,39 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable, ...@@ -254,39 +301,39 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
source<<"if (indexInBlock < "<<(L*m)<<") {\n"; source<<"if (indexInBlock < "<<(L*m)<<") {\n";
source<<"int i = indexInBlock;\n"; source<<"int i = indexInBlock;\n";
source<<"int j = i/"<<m<<";\n"; source<<"int j = i/"<<m<<";\n";
source<<"float3 c0r = real"<<input<<"[i];\n"; source<<"mixed3 c0r = real"<<input<<"[i];\n";
source<<"float3 c0i = imag"<<input<<"[i];\n"; source<<"mixed3 c0i = imag"<<input<<"[i];\n";
source<<"float3 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"mixed3 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"float3 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"mixed3 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"float3 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n"; source<<"mixed3 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"float3 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n"; source<<"mixed3 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"float3 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n"; source<<"mixed3 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n";
source<<"float3 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n"; source<<"mixed3 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n";
source<<"float3 c4r = real"<<input<<"[i+"<<(4*L*m)<<"];\n"; source<<"mixed3 c4r = real"<<input<<"[i+"<<(4*L*m)<<"];\n";
source<<"float3 c4i = imag"<<input<<"[i+"<<(4*L*m)<<"];\n"; source<<"mixed3 c4i = imag"<<input<<"[i+"<<(4*L*m)<<"];\n";
source<<"float3 d0r = c1r+c4r;\n"; source<<"mixed3 d0r = c1r+c4r;\n";
source<<"float3 d0i = c1i+c4i;\n"; source<<"mixed3 d0i = c1i+c4i;\n";
source<<"float3 d1r = c2r+c3r;\n"; source<<"mixed3 d1r = c2r+c3r;\n";
source<<"float3 d1i = c2i+c3i;\n"; source<<"mixed3 d1i = c2i+c3i;\n";
source<<"float3 d2r = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c1r-c4r);\n"; source<<"mixed3 d2r = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c1r-c4r);\n";
source<<"float3 d2i = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c1i-c4i);\n"; source<<"mixed3 d2i = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c1i-c4i);\n";
source<<"float3 d3r = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c2r-c3r);\n"; source<<"mixed3 d3r = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c2r-c3r);\n";
source<<"float3 d3i = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c2i-c3i);\n"; source<<"mixed3 d3i = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c2i-c3i);\n";
source<<"float3 d4r = d0r+d1r;\n"; source<<"mixed3 d4r = d0r+d1r;\n";
source<<"float3 d4i = d0i+d1i;\n"; source<<"mixed3 d4i = d0i+d1i;\n";
source<<"float3 d5r = "<<cu.doubleToString(0.25*sqrt(5.0))<<"*(d0r-d1r);\n"; source<<"mixed3 d5r = "<<cu.doubleToString(0.25*sqrt(5.0))<<"*(d0r-d1r);\n";
source<<"float3 d5i = "<<cu.doubleToString(0.25*sqrt(5.0))<<"*(d0i-d1i);\n"; source<<"mixed3 d5i = "<<cu.doubleToString(0.25*sqrt(5.0))<<"*(d0i-d1i);\n";
source<<"float3 d6r = c0r-0.25f*d4r;\n"; source<<"mixed3 d6r = c0r-0.25f*d4r;\n";
source<<"float3 d6i = c0i-0.25f*d4i;\n"; source<<"mixed3 d6i = c0i-0.25f*d4i;\n";
source<<"float3 d7r = d6r+d5r;\n"; source<<"mixed3 d7r = d6r+d5r;\n";
source<<"float3 d7i = d6i+d5i;\n"; source<<"mixed3 d7i = d6i+d5i;\n";
source<<"float3 d8r = d6r-d5r;\n"; source<<"mixed3 d8r = d6r-d5r;\n";
source<<"float3 d8i = d6i-d5i;\n"; source<<"mixed3 d8i = d6i-d5i;\n";
string coeff = cu.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI)); string coeff = cu.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
source<<"float3 d9r = "<<sign<<"*(d2i+"<<coeff<<"*d3i);\n"; source<<"mixed3 d9r = "<<sign<<"*(d2i+"<<coeff<<"*d3i);\n";
source<<"float3 d9i = "<<sign<<"*(-d2r-"<<coeff<<"*d3r);\n"; source<<"mixed3 d9i = "<<sign<<"*(-d2r-"<<coeff<<"*d3r);\n";
source<<"float3 d10r = "<<sign<<"*("<<coeff<<"*d2i-d3i);\n"; source<<"mixed3 d10r = "<<sign<<"*("<<coeff<<"*d2i-d3i);\n";
source<<"float3 d10i = "<<sign<<"*(d3r-"<<coeff<<"*d2r);\n"; source<<"mixed3 d10i = "<<sign<<"*(d3r-"<<coeff<<"*d2r);\n";
source<<"real"<<output<<"[i+4*j*"<<m<<"] = c0r+d4r;\n"; source<<"real"<<output<<"[i+4*j*"<<m<<"] = c0r+d4r;\n";
source<<"imag"<<output<<"[i+4*j*"<<m<<"] = c0i+d4i;\n"; source<<"imag"<<output<<"[i+4*j*"<<m<<"] = c0i+d4i;\n";
source<<"real"<<output<<"[i+(4*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(5*L)<<"], d7r+d9r, d7i+d9i);\n"; source<<"real"<<output<<"[i+(4*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(5*L)<<"], d7r+d9r, d7i+d9i);\n";
...@@ -307,22 +354,22 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable, ...@@ -307,22 +354,22 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
source<<"if (indexInBlock < "<<(L*m)<<") {\n"; source<<"if (indexInBlock < "<<(L*m)<<") {\n";
source<<"int i = indexInBlock;\n"; source<<"int i = indexInBlock;\n";
source<<"int j = i/"<<m<<";\n"; source<<"int j = i/"<<m<<";\n";
source<<"float3 c0r = real"<<input<<"[i];\n"; source<<"mixed3 c0r = real"<<input<<"[i];\n";
source<<"float3 c0i = imag"<<input<<"[i];\n"; source<<"mixed3 c0i = imag"<<input<<"[i];\n";
source<<"float3 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"mixed3 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"float3 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"mixed3 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"float3 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n"; source<<"mixed3 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"float3 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n"; source<<"mixed3 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"float3 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n"; source<<"mixed3 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n";
source<<"float3 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n"; source<<"mixed3 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n";
source<<"float3 d0r = c0r+c2r;\n"; source<<"mixed3 d0r = c0r+c2r;\n";
source<<"float3 d0i = c0i+c2i;\n"; source<<"mixed3 d0i = c0i+c2i;\n";
source<<"float3 d1r = c0r-c2r;\n"; source<<"mixed3 d1r = c0r-c2r;\n";
source<<"float3 d1i = c0i-c2i;\n"; source<<"mixed3 d1i = c0i-c2i;\n";
source<<"float3 d2r = c1r+c3r;\n"; source<<"mixed3 d2r = c1r+c3r;\n";
source<<"float3 d2i = c1i+c3i;\n"; source<<"mixed3 d2i = c1i+c3i;\n";
source<<"float3 d3r = "<<sign<<"*(c1i-c3i);\n"; source<<"mixed3 d3r = "<<sign<<"*(c1i-c3i);\n";
source<<"float3 d3i = "<<sign<<"*(c3r-c1r);\n"; source<<"mixed3 d3i = "<<sign<<"*(c3r-c1r);\n";
source<<"real"<<output<<"[i+3*j*"<<m<<"] = d0r+d2r;\n"; source<<"real"<<output<<"[i+3*j*"<<m<<"] = d0r+d2r;\n";
source<<"imag"<<output<<"[i+3*j*"<<m<<"] = d0i+d2i;\n"; source<<"imag"<<output<<"[i+3*j*"<<m<<"] = d0i+d2i;\n";
source<<"real"<<output<<"[i+(3*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(4*L)<<"], d1r+d3r, d1i+d3i);\n"; source<<"real"<<output<<"[i+(3*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(4*L)<<"], d1r+d3r, d1i+d3i);\n";
...@@ -341,18 +388,18 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable, ...@@ -341,18 +388,18 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
source<<"if (indexInBlock < "<<(L*m)<<") {\n"; source<<"if (indexInBlock < "<<(L*m)<<") {\n";
source<<"int i = indexInBlock;\n"; source<<"int i = indexInBlock;\n";
source<<"int j = i/"<<m<<";\n"; source<<"int j = i/"<<m<<";\n";
source<<"float3 c0r = real"<<input<<"[i];\n"; source<<"mixed3 c0r = real"<<input<<"[i];\n";
source<<"float3 c0i = imag"<<input<<"[i];\n"; source<<"mixed3 c0i = imag"<<input<<"[i];\n";
source<<"float3 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"mixed3 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"float3 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"mixed3 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"float3 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n"; source<<"mixed3 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"float3 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n"; source<<"mixed3 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"float3 d0r = c1r+c2r;\n"; source<<"mixed3 d0r = c1r+c2r;\n";
source<<"float3 d0i = c1i+c2i;\n"; source<<"mixed3 d0i = c1i+c2i;\n";
source<<"float3 d1r = c0r-0.5f*d0r;\n"; source<<"mixed3 d1r = c0r-0.5f*d0r;\n";
source<<"float3 d1i = c0i-0.5f*d0i;\n"; source<<"mixed3 d1i = c0i-0.5f*d0i;\n";
source<<"float3 d2r = "<<sign<<"*"<<cu.doubleToString(sin(M_PI/3.0))<<"*(c1i-c2i);\n"; source<<"mixed3 d2r = "<<sign<<"*"<<cu.doubleToString(sin(M_PI/3.0))<<"*(c1i-c2i);\n";
source<<"float3 d2i = "<<sign<<"*"<<cu.doubleToString(sin(M_PI/3.0))<<"*(c2r-c1r);\n"; source<<"mixed3 d2i = "<<sign<<"*"<<cu.doubleToString(sin(M_PI/3.0))<<"*(c2r-c1r);\n";
source<<"real"<<output<<"[i+2*j*"<<m<<"] = c0r+d0r;\n"; source<<"real"<<output<<"[i+2*j*"<<m<<"] = c0r+d0r;\n";
source<<"imag"<<output<<"[i+2*j*"<<m<<"] = c0i+d0i;\n"; source<<"imag"<<output<<"[i+2*j*"<<m<<"] = c0i+d0i;\n";
source<<"real"<<output<<"[i+(2*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(3*L)<<"], d1r+d2r, d1i+d2i);\n"; source<<"real"<<output<<"[i+(2*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(3*L)<<"], d1r+d2r, d1i+d2i);\n";
...@@ -369,10 +416,10 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable, ...@@ -369,10 +416,10 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
source<<"if (indexInBlock < "<<(L*m)<<") {\n"; source<<"if (indexInBlock < "<<(L*m)<<") {\n";
source<<"int i = indexInBlock;\n"; source<<"int i = indexInBlock;\n";
source<<"int j = i/"<<m<<";\n"; source<<"int j = i/"<<m<<";\n";
source<<"float3 c0r = real"<<input<<"[i];\n"; source<<"mixed3 c0r = real"<<input<<"[i];\n";
source<<"float3 c0i = imag"<<input<<"[i];\n"; source<<"mixed3 c0i = imag"<<input<<"[i];\n";
source<<"float3 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"mixed3 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"float3 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"mixed3 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"real"<<output<<"[i+j*"<<m<<"] = c0r+c1r;\n"; source<<"real"<<output<<"[i+j*"<<m<<"] = c0r+c1r;\n";
source<<"imag"<<output<<"[i+j*"<<m<<"] = c0i+c1i;\n"; source<<"imag"<<output<<"[i+j*"<<m<<"] = c0i+c1i;\n";
source<<"real"<<output<<"[i+(j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(2*L)<<"], c0r-c1r, c0i-c1i);\n"; source<<"real"<<output<<"[i+(j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(2*L)<<"], c0r-c1r, c0i-c1i);\n";
......
...@@ -91,7 +91,7 @@ private: ...@@ -91,7 +91,7 @@ private:
CudaArray* forces; CudaArray* forces;
CudaArray* positions; CudaArray* positions;
CudaArray* velocities; CudaArray* velocities;
CUfunction pileKernel, stepKernel, velocitiesKernel, copyToContextKernel, copyFromContextKernel, translateKernel; CUfunction pileKernel, stepKernel, velocitiesKernel, copyPositionsToContextKernel, copyVelocitiesToContextKernel, copyForcesFromContextKernel, translateKernel;
}; };
} // namespace OpenMM } // namespace OpenMM
......
__device__ float3 multiplyComplexRealPart(float2 c1, float3 c2r, float3 c2i) { __device__ mixed3 multiplyComplexRealPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
return c1.x*c2r-c1.y*c2i; return c1.x*c2r-c1.y*c2i;
} }
__device__ float3 multiplyComplexImagPart(float2 c1, float3 c2r, float3 c2i) { __device__ mixed3 multiplyComplexImagPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
return c1.x*c2i+c1.y*c2r; return c1.x*c2i+c1.y*c2r;
} }
__device__ float3 multiplyComplexRealPartConj(float2 c1, float3 c2r, float3 c2i) { __device__ mixed3 multiplyComplexRealPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
return c1.x*c2r+c1.y*c2i; return c1.x*c2r+c1.y*c2i;
} }
__device__ float3 multiplyComplexImagPartConj(float2 c1, float3 c2r, float3 c2i) { __device__ mixed3 multiplyComplexImagPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
return c1.x*c2i-c1.y*c2r; return c1.x*c2i-c1.y*c2r;
} }
/** /**
* Apply the PILE-L thermostat. * Apply the PILE-L thermostat.
*/ */
extern "C" __global__ void applyPileThermostat(float4* velm, float4* random, unsigned int randomIndex, extern "C" __global__ void applyPileThermostat(mixed4* velm, float4* random, unsigned int randomIndex,
float dt, float kT, float friction) { mixed dt, mixed kT, mixed friction) {
const int numBlocks = blockDim.x*gridDim.x/NUM_COPIES; const int numBlocks = blockDim.x*gridDim.x/NUM_COPIES;
const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES); const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES);
const int indexInBlock = threadIdx.x-blockStart; const int indexInBlock = threadIdx.x-blockStart;
const float nkT = NUM_COPIES*kT; const mixed nkT = NUM_COPIES*kT;
const float twown = 2.0f*nkT/HBAR; const mixed twown = 2.0f*nkT/HBAR;
const float c1_0 = EXP(-0.5f*dt*friction); const mixed c1_0 = EXP(-0.5f*dt*friction);
const float c2_0 = SQRT(1.0f-c1_0*c1_0); const mixed c2_0 = SQRT(1.0f-c1_0*c1_0);
__shared__ float3 v[2*THREAD_BLOCK_SIZE]; __shared__ mixed3 v[2*THREAD_BLOCK_SIZE];
__shared__ float3 temp[2*THREAD_BLOCK_SIZE]; __shared__ mixed3 temp[2*THREAD_BLOCK_SIZE];
__shared__ float2 w[NUM_COPIES]; __shared__ mixed2 w[NUM_COPIES];
float3* vreal = &v[blockStart]; mixed3* vreal = &v[blockStart];
float3* vimag = &v[blockStart+blockDim.x]; mixed3* vimag = &v[blockStart+blockDim.x];
if (threadIdx.x < NUM_COPIES) if (threadIdx.x < NUM_COPIES)
w[indexInBlock] = make_float2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES)); w[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
__syncthreads(); __syncthreads();
randomIndex += NUM_COPIES*((blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES); randomIndex += NUM_COPIES*((blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES);
for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) { for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
float4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS]; mixed4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
float invMass = particleVelm.w; mixed invMass = particleVelm.w;
float c3_0 = c2_0*SQRT(nkT*invMass); mixed c3_0 = c2_0*SQRT(nkT*invMass);
// Forward FFT. // Forward FFT.
vreal[indexInBlock] = SCALE*make_float3(particleVelm.x, particleVelm.y, particleVelm.z); vreal[indexInBlock] = SCALE*make_mixed3(particleVelm.x, particleVelm.y, particleVelm.z);
vimag[indexInBlock] = make_float3(0); vimag[indexInBlock] = make_mixed3(0);
__syncthreads(); __syncthreads();
FFT_V_FORWARD FFT_V_FORWARD
...@@ -53,28 +53,28 @@ extern "C" __global__ void applyPileThermostat(float4* velm, float4* random, uns ...@@ -53,28 +53,28 @@ extern "C" __global__ void applyPileThermostat(float4* velm, float4* random, uns
// Apply a local Langevin thermostat to the centroid mode. // Apply a local Langevin thermostat to the centroid mode.
float4 rand = random[randomIndex]; float4 rand = random[randomIndex];
vreal[0] = vreal[0]*c1_0 + c3_0*make_float3(rand.x, rand.y, rand.z); vreal[0] = vreal[0]*c1_0 + c3_0*make_mixed3(rand.x, rand.y, rand.z);
} }
else { else {
// Use critical damping white noise for the remaining modes. // Use critical damping white noise for the remaining modes.
int k = (indexInBlock <= NUM_COPIES/2 ? indexInBlock : NUM_COPIES-indexInBlock); int k = (indexInBlock <= NUM_COPIES/2 ? indexInBlock : NUM_COPIES-indexInBlock);
const bool isCenter = (NUM_COPIES%2 == 0 && k == NUM_COPIES/2); const bool isCenter = (NUM_COPIES%2 == 0 && k == NUM_COPIES/2);
const float wk = twown*sin(k*M_PI/NUM_COPIES); const mixed wk = twown*sin(k*M_PI/NUM_COPIES);
const float c1 = EXP(-wk*dt); const mixed c1 = EXP(-wk*dt);
const float c2 = SQRT((1.0f-c1*c1)/2.0f) * (isCenter ? sqrt(2.0f) : 1.0f); const mixed c2 = SQRT((1.0f-c1*c1)/2.0f) * (isCenter ? sqrt(2.0f) : 1.0f);
const float c3 = c2*SQRT(nkT*invMass); const mixed c3 = c2*SQRT(nkT*invMass);
float4 rand1 = c3*random[randomIndex+k]; float4 rand1 = random[randomIndex+k];
float4 rand2 = (isCenter ? make_float4(0) : c3*random[randomIndex+NUM_COPIES-k]); float4 rand2 = (isCenter ? make_float4(0) : random[randomIndex+NUM_COPIES-k]);
vreal[indexInBlock] = c1*vreal[indexInBlock] + make_float3(rand1.x, rand1.y, rand1.z); vreal[indexInBlock] = c1*vreal[indexInBlock] + c3*make_mixed3(rand1.x, rand1.y, rand1.z);
vimag[indexInBlock] = c1*vimag[indexInBlock] + (indexInBlock < NUM_COPIES/2 ? make_float3(rand2.x, rand2.y, rand2.z) : make_float3(-rand2.x, -rand2.y, -rand2.z)); vimag[indexInBlock] = c1*vimag[indexInBlock] + c3*(indexInBlock < NUM_COPIES/2 ? make_mixed3(rand2.x, rand2.y, rand2.z) : make_mixed3(-rand2.x, -rand2.y, -rand2.z));
} }
__syncthreads(); __syncthreads();
// Inverse FFT. // Inverse FFT.
FFT_V_BACKWARD FFT_V_BACKWARD
velm[particle+indexInBlock*PADDED_NUM_ATOMS] = make_float4(SCALE*vreal[indexInBlock].x, SCALE*vreal[indexInBlock].y, SCALE*vreal[indexInBlock].z, particleVelm.w); velm[particle+indexInBlock*PADDED_NUM_ATOMS] = make_mixed4(SCALE*vreal[indexInBlock].x, SCALE*vreal[indexInBlock].y, SCALE*vreal[indexInBlock].z, particleVelm.w);
randomIndex += blockDim.x*gridDim.x; randomIndex += blockDim.x*gridDim.x;
} }
} }
...@@ -82,24 +82,24 @@ extern "C" __global__ void applyPileThermostat(float4* velm, float4* random, uns ...@@ -82,24 +82,24 @@ extern "C" __global__ void applyPileThermostat(float4* velm, float4* random, uns
/** /**
* Advance the positions and velocities. * Advance the positions and velocities.
*/ */
extern "C" __global__ void integrateStep(float4* posq, float4* velm, long long* force, float dt, float kT) { extern "C" __global__ void integrateStep(mixed4* posq, mixed4* velm, long long* force, mixed dt, mixed kT) {
const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES; const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES;
const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES); const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES);
const int indexInBlock = threadIdx.x-blockStart; const int indexInBlock = threadIdx.x-blockStart;
const float nkT = NUM_COPIES*kT; const mixed nkT = NUM_COPIES*kT;
const float twown = 2.0f*nkT/HBAR; const mixed twown = 2.0f*nkT/HBAR;
const float forceScale = 1/(float) 0xFFFFFFFF; const mixed forceScale = 1/(mixed) 0xFFFFFFFF;
__shared__ float3 q[2*THREAD_BLOCK_SIZE]; __shared__ mixed3 q[2*THREAD_BLOCK_SIZE];
__shared__ float3 v[2*THREAD_BLOCK_SIZE]; __shared__ mixed3 v[2*THREAD_BLOCK_SIZE];
__shared__ float3 temp[2*THREAD_BLOCK_SIZE]; __shared__ mixed3 temp[2*THREAD_BLOCK_SIZE];
__shared__ float2 w[NUM_COPIES]; __shared__ mixed2 w[NUM_COPIES];
// Update velocities. // Update velocities.
for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) { for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
int index = particle+indexInBlock*PADDED_NUM_ATOMS; int index = particle+indexInBlock*PADDED_NUM_ATOMS;
int forceIndex = particle+indexInBlock*PADDED_NUM_ATOMS*3; int forceIndex = particle+indexInBlock*PADDED_NUM_ATOMS*3;
float4 particleVelm = velm[index]; mixed4 particleVelm = velm[index];
particleVelm.x += forceScale*force[forceIndex]*(0.5f*dt*particleVelm.w); particleVelm.x += forceScale*force[forceIndex]*(0.5f*dt*particleVelm.w);
particleVelm.y += forceScale*force[forceIndex+PADDED_NUM_ATOMS]*(0.5f*dt*particleVelm.w); particleVelm.y += forceScale*force[forceIndex+PADDED_NUM_ATOMS]*(0.5f*dt*particleVelm.w);
particleVelm.z += forceScale*force[forceIndex+PADDED_NUM_ATOMS*2]*(0.5f*dt*particleVelm.w); particleVelm.z += forceScale*force[forceIndex+PADDED_NUM_ATOMS*2]*(0.5f*dt*particleVelm.w);
...@@ -108,23 +108,23 @@ extern "C" __global__ void integrateStep(float4* posq, float4* velm, long long* ...@@ -108,23 +108,23 @@ extern "C" __global__ void integrateStep(float4* posq, float4* velm, long long*
// Evolve the free ring polymer by transforming to the frequency domain. // Evolve the free ring polymer by transforming to the frequency domain.
float3* qreal = &q[blockStart]; mixed3* qreal = &q[blockStart];
float3* qimag = &q[blockStart+blockDim.x]; mixed3* qimag = &q[blockStart+blockDim.x];
float3* vreal = &v[blockStart]; mixed3* vreal = &v[blockStart];
float3* vimag = &v[blockStart+blockDim.x]; mixed3* vimag = &v[blockStart+blockDim.x];
if (threadIdx.x < NUM_COPIES) if (threadIdx.x < NUM_COPIES)
w[indexInBlock] = make_float2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES)); w[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
__syncthreads(); __syncthreads();
for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) { for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
float4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS]; mixed4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS];
float4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS]; mixed4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
// Forward FFT. // Forward FFT.
qreal[indexInBlock] = SCALE*make_float3(particlePosq.x, particlePosq.y, particlePosq.z); qreal[indexInBlock] = SCALE*make_mixed3(particlePosq.x, particlePosq.y, particlePosq.z);
qimag[indexInBlock] = make_float3(0); qimag[indexInBlock] = make_mixed3(0);
vreal[indexInBlock] = SCALE*make_float3(particleVelm.x, particleVelm.y, particleVelm.z); vreal[indexInBlock] = SCALE*make_mixed3(particleVelm.x, particleVelm.y, particleVelm.z);
vimag[indexInBlock] = make_float3(0); vimag[indexInBlock] = make_mixed3(0);
__syncthreads(); __syncthreads();
FFT_Q_FORWARD FFT_Q_FORWARD
FFT_V_FORWARD FFT_V_FORWARD
...@@ -136,12 +136,12 @@ extern "C" __global__ void integrateStep(float4* posq, float4* velm, long long* ...@@ -136,12 +136,12 @@ extern "C" __global__ void integrateStep(float4* posq, float4* velm, long long*
qimag[0] += vimag[0]*dt; qimag[0] += vimag[0]*dt;
} }
else { else {
const float wk = twown*sin(indexInBlock*M_PI/NUM_COPIES); const mixed wk = twown*sin(indexInBlock*M_PI/NUM_COPIES);
const float wt = wk*dt; const mixed wt = wk*dt;
const float coswt = cos(wt); const mixed coswt = cos(wt);
const float sinwt = sin(wt); const mixed sinwt = sin(wt);
const float3 vprimereal = vreal[indexInBlock]*coswt - qreal[indexInBlock]*(wk*sinwt); // Advance velocity from t to t+dt const mixed3 vprimereal = vreal[indexInBlock]*coswt - qreal[indexInBlock]*(wk*sinwt); // Advance velocity from t to t+dt
const float3 vprimeimag = vimag[indexInBlock]*coswt - qimag[indexInBlock]*(wk*sinwt); const mixed3 vprimeimag = vimag[indexInBlock]*coswt - qimag[indexInBlock]*(wk*sinwt);
qreal[indexInBlock] = vreal[indexInBlock]*(sinwt/wk) + qreal[indexInBlock]*coswt; // Advance position from t to t+dt qreal[indexInBlock] = vreal[indexInBlock]*(sinwt/wk) + qreal[indexInBlock]*coswt; // Advance position from t to t+dt
qimag[indexInBlock] = vimag[indexInBlock]*(sinwt/wk) + qimag[indexInBlock]*coswt; qimag[indexInBlock] = vimag[indexInBlock]*(sinwt/wk) + qimag[indexInBlock]*coswt;
vreal[indexInBlock] = vprimereal; vreal[indexInBlock] = vprimereal;
...@@ -153,26 +153,26 @@ extern "C" __global__ void integrateStep(float4* posq, float4* velm, long long* ...@@ -153,26 +153,26 @@ extern "C" __global__ void integrateStep(float4* posq, float4* velm, long long*
FFT_Q_BACKWARD FFT_Q_BACKWARD
FFT_V_BACKWARD FFT_V_BACKWARD
posq[particle+indexInBlock*PADDED_NUM_ATOMS] = make_float4(SCALE*qreal[indexInBlock].x, SCALE*qreal[indexInBlock].y, SCALE*qreal[indexInBlock].z, particlePosq.w); posq[particle+indexInBlock*PADDED_NUM_ATOMS] = make_mixed4(SCALE*qreal[indexInBlock].x, SCALE*qreal[indexInBlock].y, SCALE*qreal[indexInBlock].z, particlePosq.w);
velm[particle+indexInBlock*PADDED_NUM_ATOMS] = make_float4(SCALE*vreal[indexInBlock].x, SCALE*vreal[indexInBlock].y, SCALE*vreal[indexInBlock].z, particleVelm.w); velm[particle+indexInBlock*PADDED_NUM_ATOMS] = make_mixed4(SCALE*vreal[indexInBlock].x, SCALE*vreal[indexInBlock].y, SCALE*vreal[indexInBlock].z, particleVelm.w);
} }
} }
/** /**
* Advance the velocities by a half step. * Advance the velocities by a half step.
*/ */
extern "C" __global__ void advanceVelocities(float4* velm, long long* force, float dt) { extern "C" __global__ void advanceVelocities(mixed4* velm, long long* force, mixed dt) {
const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES; const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES;
const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES); const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES);
const int indexInBlock = threadIdx.x-blockStart; const int indexInBlock = threadIdx.x-blockStart;
const float forceScale = 1/(float) 0xFFFFFFFF; const mixed forceScale = 1/(mixed) 0xFFFFFFFF;
// Update velocities. // Update velocities.
for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) { for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
int index = particle+indexInBlock*PADDED_NUM_ATOMS; int index = particle+indexInBlock*PADDED_NUM_ATOMS;
int forceIndex = particle+indexInBlock*PADDED_NUM_ATOMS*3; int forceIndex = particle+indexInBlock*PADDED_NUM_ATOMS*3;
float4 particleVelm = velm[index]; mixed4 particleVelm = velm[index];
particleVelm.x += forceScale*force[forceIndex]*(0.5f*dt*particleVelm.w); particleVelm.x += forceScale*force[forceIndex]*(0.5f*dt*particleVelm.w);
particleVelm.y += forceScale*force[forceIndex+PADDED_NUM_ATOMS]*(0.5f*dt*particleVelm.w); particleVelm.y += forceScale*force[forceIndex+PADDED_NUM_ATOMS]*(0.5f*dt*particleVelm.w);
particleVelm.z += forceScale*force[forceIndex+PADDED_NUM_ATOMS*2]*(0.5f*dt*particleVelm.w); particleVelm.z += forceScale*force[forceIndex+PADDED_NUM_ATOMS*2]*(0.5f*dt*particleVelm.w);
...@@ -181,9 +181,20 @@ extern "C" __global__ void advanceVelocities(float4* velm, long long* force, flo ...@@ -181,9 +181,20 @@ extern "C" __global__ void advanceVelocities(float4* velm, long long* force, flo
} }
/** /**
* Copy a set of per-atom values from the integrator's arrays to the context. * Copy a set of positions from the integrator's arrays to the context.
*/ */
extern "C" __global__ void copyToContext(float4* src, float4* dst, int* order, int copy) { extern "C" __global__ void copyPositionsToContext(mixed4* src, real4* dst, int* order, int copy) {
const int base = copy*PADDED_NUM_ATOMS;
for (int particle = blockIdx.x*blockDim.x+threadIdx.x; particle < NUM_ATOMS; particle += blockDim.x*gridDim.x) {
mixed4 posq = src[base+order[particle]];
dst[particle] = make_real4(posq.x, posq.y, posq.z, posq.w);
}
}
/**
* Copy a set of velocities from the integrator's arrays to the context.
*/
extern "C" __global__ void copyVelocitiesToContext(mixed4* src, mixed4* dst, int* order, int copy) {
const int base = copy*PADDED_NUM_ATOMS; const int base = copy*PADDED_NUM_ATOMS;
for (int particle = blockIdx.x*blockDim.x+threadIdx.x; particle < NUM_ATOMS; particle += blockDim.x*gridDim.x) { for (int particle = blockIdx.x*blockDim.x+threadIdx.x; particle < NUM_ATOMS; particle += blockDim.x*gridDim.x) {
dst[particle] = src[base+order[particle]]; dst[particle] = src[base+order[particle]];
...@@ -191,9 +202,9 @@ extern "C" __global__ void copyToContext(float4* src, float4* dst, int* order, i ...@@ -191,9 +202,9 @@ extern "C" __global__ void copyToContext(float4* src, float4* dst, int* order, i
} }
/** /**
* Copy a set of per-atom force values from the context to the integrator's arrays. * Copy a set of forces from the context to the integrator's arrays.
*/ */
extern "C" __global__ void copyFromContext(long long* src, long long* dst, int* order, int copy) { extern "C" __global__ void copyForcesFromContext(long long* src, long long* dst, int* order, int copy) {
const int base = copy*PADDED_NUM_ATOMS*3; const int base = copy*PADDED_NUM_ATOMS*3;
for (int particle = blockIdx.x*blockDim.x+threadIdx.x; particle < NUM_ATOMS; particle += blockDim.x*gridDim.x) { for (int particle = blockIdx.x*blockDim.x+threadIdx.x; particle < NUM_ATOMS; particle += blockDim.x*gridDim.x) {
dst[base+order[particle]] = src[particle]; dst[base+order[particle]] = src[particle];
...@@ -205,10 +216,11 @@ extern "C" __global__ void copyFromContext(long long* src, long long* dst, int* ...@@ -205,10 +216,11 @@ extern "C" __global__ void copyFromContext(long long* src, long long* dst, int*
/** /**
* Update atom positions so all copies are offset by the same number of periodic box widths. * Update atom positions so all copies are offset by the same number of periodic box widths.
*/ */
extern "C" __global__ void applyCellTranslations(float4* posq, float4* movedPos, int* order, int movedCopy) { extern "C" __global__ void applyCellTranslations(mixed4* posq, real4* movedPos, int* order, int movedCopy) {
for (int particle = blockIdx.x*blockDim.x+threadIdx.x; particle < NUM_ATOMS; particle += blockDim.x*gridDim.x) { for (int particle = blockIdx.x*blockDim.x+threadIdx.x; particle < NUM_ATOMS; particle += blockDim.x*gridDim.x) {
int index = order[particle]; int index = order[particle];
float4 delta = movedPos[particle]-posq[movedCopy*PADDED_NUM_ATOMS+index]; real4 p = movedPos[particle];
mixed4 delta = make_mixed4(p.x, p.y, p.z, p.w)-posq[movedCopy*PADDED_NUM_ATOMS+index];
for (int copy = 0; copy < NUM_COPIES; copy++) for (int copy = 0; copy < NUM_COPIES; copy++)
posq[copy*PADDED_NUM_ATOMS+index] += delta; posq[copy*PADDED_NUM_ATOMS+index] += delta;
} }
......
...@@ -14,6 +14,10 @@ FOREACH(TEST_PROG ${TEST_PROGS}) ...@@ -14,6 +14,10 @@ FOREACH(TEST_PROG ${TEST_PROGS})
# Link with shared library # Link with shared library
ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG}) ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_RPMD_TARGET}) TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_RPMD_TARGET})
ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT}) ADD_TEST(${TEST_ROOT}Single ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} single)
IF (OPENMM_BUILD_CUDA_DOUBLE_PRECISION_TESTS)
ADD_TEST(${TEST_ROOT}Mixed ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} mixed)
ADD_TEST(${TEST_ROOT}Double ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} double)
ENDIF(OPENMM_BUILD_CUDA_DOUBLE_PRECISION_TESTS)
ENDFOREACH(TEST_PROG ${TEST_PROGS}) ENDFOREACH(TEST_PROG ${TEST_PROGS})
...@@ -165,7 +165,6 @@ void testParaHydrogen() { ...@@ -165,7 +165,6 @@ void testParaHydrogen() {
vector<int> counts(numBins, 0); vector<int> counts(numBins, 0);
const double invBoxSize = 1.0/boxSize; const double invBoxSize = 1.0/boxSize;
double meanKE = 0.0; double meanKE = 0.0;
const RealOpenMM hbar = 1.054571628e-34*AVOGADRO/(1000*1e-12);
for (int step = 0; step < numSteps; step++) { for (int step = 0; step < numSteps; step++) {
integ.step(20); integ.step(20);
vector<State> states(numCopies); vector<State> states(numCopies);
...@@ -221,9 +220,11 @@ void testParaHydrogen() { ...@@ -221,9 +220,11 @@ void testParaHydrogen() {
ASSERT_USUALLY_EQUAL_TOL(60.0, 1.5*temperature+meanKE, 0.02); ASSERT_USUALLY_EQUAL_TOL(60.0, 1.5*temperature+meanKE, 0.02);
} }
int main() { int main(int argc, char* argv[]) {
try { try {
Platform::loadPluginsFromDirectory(Platform::getDefaultPluginsDirectory()); Platform::loadPluginsFromDirectory(Platform::getDefaultPluginsDirectory());
if (argc > 1)
Platform::getPlatformByName("CUDA").setPropertyDefaultValue("CudaPrecision", string(argv[1]));
testFreeParticles(); testFreeParticles();
testParaHydrogen(); testParaHydrogen();
} }
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011 Stanford University and the Authors. * * Portions copyright (c) 2011-2012 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -49,6 +49,7 @@ OpenCLIntegrateRPMDStepKernel::~OpenCLIntegrateRPMDStepKernel() { ...@@ -49,6 +49,7 @@ OpenCLIntegrateRPMDStepKernel::~OpenCLIntegrateRPMDStepKernel() {
if (velocities != NULL) if (velocities != NULL)
delete velocities; delete velocities;
} }
void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) { void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) {
cl.getPlatformData().initializeContexts(system); cl.getPlatformData().initializeContexts(system);
numCopies = integrator.getNumCopies(); numCopies = integrator.getNumCopies();
...@@ -59,20 +60,34 @@ void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDI ...@@ -59,20 +60,34 @@ void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDI
if (numCopies != OpenCLFFT3D::findLegalDimension(numCopies)) if (numCopies != OpenCLFFT3D::findLegalDimension(numCopies))
throw OpenMMException("RPMDIntegrator: the number of copies must be a multiple of powers of 2, 3, and 5."); throw OpenMMException("RPMDIntegrator: the number of copies must be a multiple of powers of 2, 3, and 5.");
int paddedParticles = cl.getPaddedNumAtoms(); int paddedParticles = cl.getPaddedNumAtoms();
forces = OpenCLArray::create<mm_float4>(cl, numCopies*paddedParticles, "rpmdForces"); int forceElementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
positions = OpenCLArray::create<mm_float4>(cl, numCopies*paddedParticles, "rpmdPositions"); forces = new OpenCLArray(cl, numCopies*paddedParticles, forceElementSize, "rpmdForces");
velocities = OpenCLArray::create<mm_float4>(cl, numCopies*paddedParticles, "rpmdVelocities"); bool useDoublePrecision = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision());
int elementSize = (useDoublePrecision ? sizeof(mm_double4) : sizeof(mm_float4));
positions = new OpenCLArray(cl, numCopies*paddedParticles, elementSize, "rpmdPositions");
velocities = new OpenCLArray(cl, numCopies*paddedParticles, elementSize, "rpmdVelocities");
cl.getIntegrationUtilities().initRandomNumberGenerator((unsigned int) integrator.getRandomNumberSeed()); cl.getIntegrationUtilities().initRandomNumberGenerator((unsigned int) integrator.getRandomNumberSeed());
// Fill in the posq and velm arrays with safe values to avoid a risk of nans. // Fill in the posq and velm arrays with safe values to avoid a risk of nans.
vector<mm_float4> temp(positions->getSize()); if (useDoublePrecision) {
for (int i = 0; i < positions->getSize(); i++) vector<mm_double4> temp(positions->getSize());
temp[i] = mm_float4(0, 0, 0, 0); for (int i = 0; i < positions->getSize(); i++)
positions->upload(temp); temp[i] = mm_double4(0, 0, 0, 0);
for (int i = 0; i < velocities->getSize(); i++) positions->upload(temp);
temp[i] = mm_float4(0, 0, 0, 1); for (int i = 0; i < velocities->getSize(); i++)
velocities->upload(temp); temp[i] = mm_double4(0, 0, 0, 1);
velocities->upload(temp);
}
else {
vector<mm_float4> temp(positions->getSize());
for (int i = 0; i < positions->getSize(); i++)
temp[i] = mm_float4(0, 0, 0, 0);
positions->upload(temp);
for (int i = 0; i < velocities->getSize(); i++)
temp[i] = mm_float4(0, 0, 0, 1);
velocities->upload(temp);
}
// Create kernels. // Create kernels.
...@@ -80,6 +95,7 @@ void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDI ...@@ -80,6 +95,7 @@ void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDI
defines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms()); defines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
defines["NUM_COPIES"] = cl.intToString(numCopies); defines["NUM_COPIES"] = cl.intToString(numCopies);
defines["THREAD_BLOCK_SIZE"] = cl.intToString(workgroupSize);
defines["HBAR"] = cl.doubleToString(1.054571628e-34*AVOGADRO/(1000*1e-12)); defines["HBAR"] = cl.doubleToString(1.054571628e-34*AVOGADRO/(1000*1e-12));
defines["SCALE"] = cl.doubleToString(1.0/sqrt((double) numCopies)); defines["SCALE"] = cl.doubleToString(1.0/sqrt((double) numCopies));
defines["M_PI"] = cl.doubleToString(M_PI); defines["M_PI"] = cl.doubleToString(M_PI);
...@@ -92,8 +108,9 @@ void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDI ...@@ -92,8 +108,9 @@ void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDI
pileKernel = cl::Kernel(program, "applyPileThermostat"); pileKernel = cl::Kernel(program, "applyPileThermostat");
stepKernel = cl::Kernel(program, "integrateStep"); stepKernel = cl::Kernel(program, "integrateStep");
velocitiesKernel = cl::Kernel(program, "advanceVelocities"); velocitiesKernel = cl::Kernel(program, "advanceVelocities");
copyToContextKernel = cl::Kernel(program, "copyToContext"); copyPositionsToContextKernel = cl::Kernel(program, "copyPositionsToContext");
copyFromContextKernel = cl::Kernel(program, "copyFromContext"); copyVelocitiesToContextKernel = cl::Kernel(program, "copyVelocitiesToContext");
copyForcesFromContextKernel = cl::Kernel(program, "copyForcesFromContext");
translateKernel = cl::Kernel(program, "applyCellTranslations"); translateKernel = cl::Kernel(program, "applyCellTranslations");
} }
...@@ -102,48 +119,56 @@ void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDInte ...@@ -102,48 +119,56 @@ void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDInte
if (!hasInitializedKernel) { if (!hasInitializedKernel) {
hasInitializedKernel = true; hasInitializedKernel = true;
pileKernel.setArg<cl::Buffer>(0, velocities->getDeviceBuffer()); pileKernel.setArg<cl::Buffer>(0, velocities->getDeviceBuffer());
pileKernel.setArg(1, 2*workgroupSize*sizeof(mm_float4), NULL);
pileKernel.setArg(2, 2*workgroupSize*sizeof(mm_float4), NULL);
pileKernel.setArg(3, numCopies*sizeof(mm_float2), NULL);
stepKernel.setArg<cl::Buffer>(0, positions->getDeviceBuffer()); stepKernel.setArg<cl::Buffer>(0, positions->getDeviceBuffer());
stepKernel.setArg<cl::Buffer>(1, velocities->getDeviceBuffer()); stepKernel.setArg<cl::Buffer>(1, velocities->getDeviceBuffer());
stepKernel.setArg<cl::Buffer>(2, forces->getDeviceBuffer()); stepKernel.setArg<cl::Buffer>(2, forces->getDeviceBuffer());
stepKernel.setArg(3, 2*workgroupSize*sizeof(mm_float4), NULL);
stepKernel.setArg(4, 2*workgroupSize*sizeof(mm_float4), NULL);
stepKernel.setArg(5, 2*workgroupSize*sizeof(mm_float4), NULL);
stepKernel.setArg(6, numCopies*sizeof(mm_float2), NULL);
velocitiesKernel.setArg<cl::Buffer>(0, velocities->getDeviceBuffer()); velocitiesKernel.setArg<cl::Buffer>(0, velocities->getDeviceBuffer());
velocitiesKernel.setArg<cl::Buffer>(1, forces->getDeviceBuffer()); velocitiesKernel.setArg<cl::Buffer>(1, forces->getDeviceBuffer());
translateKernel.setArg<cl::Buffer>(0, positions->getDeviceBuffer()); translateKernel.setArg<cl::Buffer>(0, positions->getDeviceBuffer());
translateKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer()); translateKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
translateKernel.setArg<cl::Buffer>(2, cl.getAtomIndexArray().getDeviceBuffer()); translateKernel.setArg<cl::Buffer>(2, cl.getAtomIndexArray().getDeviceBuffer());
copyPositionsToContextKernel.setArg<cl::Buffer>(0, positions->getDeviceBuffer());
copyPositionsToContextKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
copyPositionsToContextKernel.setArg<cl::Buffer>(2, cl.getAtomIndexArray().getDeviceBuffer());
copyVelocitiesToContextKernel.setArg<cl::Buffer>(0, velocities->getDeviceBuffer());
copyVelocitiesToContextKernel.setArg<cl::Buffer>(1, cl.getVelm().getDeviceBuffer());
copyVelocitiesToContextKernel.setArg<cl::Buffer>(2, cl.getAtomIndexArray().getDeviceBuffer());
copyForcesFromContextKernel.setArg<cl::Buffer>(0, cl.getForce().getDeviceBuffer());
copyForcesFromContextKernel.setArg<cl::Buffer>(1, forces->getDeviceBuffer());
copyForcesFromContextKernel.setArg<cl::Buffer>(2, cl.getAtomIndexArray().getDeviceBuffer());
} }
// Loop over copies and compute the force on each one. // Loop over copies and compute the force on each one.
copyToContextKernel.setArg<cl::Buffer>(0, positions->getDeviceBuffer());
copyToContextKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
copyToContextKernel.setArg<cl::Buffer>(2, cl.getAtomIndexArray().getDeviceBuffer());
copyFromContextKernel.setArg<cl::Buffer>(0, cl.getForce().getDeviceBuffer());
copyFromContextKernel.setArg<cl::Buffer>(1, forces->getDeviceBuffer());
copyFromContextKernel.setArg<cl::Buffer>(2, cl.getAtomIndexArray().getDeviceBuffer());
if (!forcesAreValid) if (!forcesAreValid)
computeForces(context); computeForces(context);
// Apply the PILE-L thermostat. // Apply the PILE-L thermostat.
bool useDoublePrecision = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision());
const double dt = integrator.getStepSize(); const double dt = integrator.getStepSize();
pileKernel.setArg<cl_uint>(5, integration.prepareRandomNumbers(numParticles*numCopies)); pileKernel.setArg<cl_uint>(2, integration.prepareRandomNumbers(numParticles*numCopies));
pileKernel.setArg<cl::Buffer>(4, integration.getRandom().getDeviceBuffer()); // Do this *after* prepareRandomNumbers(), which might rebuild the array. pileKernel.setArg<cl::Buffer>(1, integration.getRandom().getDeviceBuffer()); // Do this *after* prepareRandomNumbers(), which might rebuild the array.
pileKernel.setArg<cl_float>(6, (cl_float) dt); if (useDoublePrecision) {
pileKernel.setArg<cl_float>(7, (cl_float) (integrator.getTemperature()*BOLTZ)); pileKernel.setArg<cl_double>(3, dt);
pileKernel.setArg<cl_float>(8, (cl_float) integrator.getFriction()); pileKernel.setArg<cl_double>(4, integrator.getTemperature()*BOLTZ);
pileKernel.setArg<cl_double>(5, integrator.getFriction());
stepKernel.setArg<cl_double>(3, dt);
stepKernel.setArg<cl_double>(4, integrator.getTemperature()*BOLTZ);
velocitiesKernel.setArg<cl_double>(2, dt);
}
else {
pileKernel.setArg<cl_float>(3, (cl_float) dt);
pileKernel.setArg<cl_float>(4, (cl_float) (integrator.getTemperature()*BOLTZ));
pileKernel.setArg<cl_float>(5, (cl_float) integrator.getFriction());
stepKernel.setArg<cl_float>(3, (cl_float) dt);
stepKernel.setArg<cl_float>(4, (cl_float) (integrator.getTemperature()*BOLTZ));
velocitiesKernel.setArg<cl_float>(2, (cl_float) dt);
}
cl.executeKernel(pileKernel, numParticles*numCopies, workgroupSize); cl.executeKernel(pileKernel, numParticles*numCopies, workgroupSize);
// Update positions and velocities. // Update positions and velocities.
stepKernel.setArg<cl_float>(7, (cl_float) dt);
stepKernel.setArg<cl_float>(8, (cl_float) (integrator.getTemperature()*BOLTZ));
cl.executeKernel(stepKernel, numParticles*numCopies, workgroupSize); cl.executeKernel(stepKernel, numParticles*numCopies, workgroupSize);
// Calculate forces based on the updated positions. // Calculate forces based on the updated positions.
...@@ -151,12 +176,11 @@ void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDInte ...@@ -151,12 +176,11 @@ void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDInte
computeForces(context); computeForces(context);
// Update velocities. // Update velocities.
velocitiesKernel.setArg<cl_float>(2, (cl_float) dt);
cl.executeKernel(velocitiesKernel, numParticles*numCopies, workgroupSize); cl.executeKernel(velocitiesKernel, numParticles*numCopies, workgroupSize);
// Apply the PILE-L thermostat again. // Apply the PILE-L thermostat again.
pileKernel.setArg<cl_uint>(5, integration.prepareRandomNumbers(numParticles*numCopies)); pileKernel.setArg<cl_uint>(2, integration.prepareRandomNumbers(numParticles*numCopies));
cl.executeKernel(pileKernel, numParticles*numCopies, workgroupSize); cl.executeKernel(pileKernel, numParticles*numCopies, workgroupSize);
// Update the time and step count. // Update the time and step count.
...@@ -167,11 +191,11 @@ void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDInte ...@@ -167,11 +191,11 @@ void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDInte
void OpenCLIntegrateRPMDStepKernel::computeForces(ContextImpl& context) { void OpenCLIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
for (int i = 0; i < numCopies; i++) { for (int i = 0; i < numCopies; i++) {
copyToContextKernel.setArg<cl_int>(3, i); copyPositionsToContextKernel.setArg<cl_int>(3, i);
cl.executeKernel(copyToContextKernel, cl.getNumAtoms()); cl.executeKernel(copyPositionsToContextKernel, cl.getNumAtoms());
context.calcForcesAndEnergy(true, false); context.calcForcesAndEnergy(true, false);
copyFromContextKernel.setArg<cl_int>(3, i); copyForcesFromContextKernel.setArg<cl_int>(3, i);
cl.executeKernel(copyFromContextKernel, cl.getNumAtoms()); cl.executeKernel(copyForcesFromContextKernel, cl.getNumAtoms());
if (cl.getAtomsWereReordered() && cl.getNonbondedUtilities().getUsePeriodic()) { if (cl.getAtomsWereReordered() && cl.getNonbondedUtilities().getUsePeriodic()) {
// Atoms may have been translated into a different periodic box, so apply // Atoms may have been translated into a different periodic box, so apply
// the same translation to all the beads. // the same translation to all the beads.
...@@ -191,11 +215,28 @@ void OpenCLIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& p ...@@ -191,11 +215,28 @@ void OpenCLIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& p
throw OpenMMException("RPMDIntegrator: Cannot set positions before the integrator is added to a Context"); throw OpenMMException("RPMDIntegrator: Cannot set positions before the integrator is added to a Context");
if (pos.size() != numParticles) if (pos.size() != numParticles)
throw OpenMMException("RPMDIntegrator: wrong number of values passed to setPositions()"); throw OpenMMException("RPMDIntegrator: wrong number of values passed to setPositions()");
vector<mm_float4> posq(cl.getPaddedNumAtoms()); if (cl.getUseDoublePrecision()) {
cl.getPosq().download(posq); vector<mm_double4> posq(cl.getPaddedNumAtoms());
for (int i = 0; i < numParticles; i++) cl.getPosq().download(posq);
posq[i] = mm_float4((cl_float) pos[i][0], (cl_float) pos[i][1], (cl_float) pos[i][2], posq[i].w); for (int i = 0; i < numParticles; i++)
cl.getQueue().enqueueWriteBuffer(positions->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_float4), numParticles*sizeof(mm_float4), &posq[0]); posq[i] = mm_double4(pos[i][0], pos[i][1], pos[i][2], posq[i].w);
cl.getQueue().enqueueWriteBuffer(positions->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &posq[0]);
}
else if (cl.getUseMixedPrecision()) {
vector<mm_float4> posqf(cl.getPaddedNumAtoms());
cl.getPosq().download(posqf);
vector<mm_double4> posq(cl.getPaddedNumAtoms());
for (int i = 0; i < numParticles; i++)
posq[i] = mm_double4(pos[i][0], pos[i][1], pos[i][2], posqf[i].w);
cl.getQueue().enqueueWriteBuffer(positions->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &posq[0]);
}
else {
vector<mm_float4> posq(cl.getPaddedNumAtoms());
cl.getPosq().download(posq);
for (int i = 0; i < numParticles; i++)
posq[i] = mm_float4((cl_float) pos[i][0], (cl_float) pos[i][1], (cl_float) pos[i][2], posq[i].w);
cl.getQueue().enqueueWriteBuffer(positions->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_float4), numParticles*sizeof(mm_float4), &posq[0]);
}
} }
void OpenCLIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& vel) { void OpenCLIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& vel) {
...@@ -203,22 +244,27 @@ void OpenCLIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& ...@@ -203,22 +244,27 @@ void OpenCLIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>&
throw OpenMMException("RPMDIntegrator: Cannot set velocities before the integrator is added to a Context"); throw OpenMMException("RPMDIntegrator: Cannot set velocities before the integrator is added to a Context");
if (vel.size() != numParticles) if (vel.size() != numParticles)
throw OpenMMException("RPMDIntegrator: wrong number of values passed to setVelocities()"); throw OpenMMException("RPMDIntegrator: wrong number of values passed to setVelocities()");
vector<mm_float4> velm(cl.getPaddedNumAtoms()); if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
cl.getVelm().download(velm); vector<mm_double4> velm(cl.getPaddedNumAtoms());
for (int i = 0; i < numParticles; i++) cl.getVelm().download(velm);
velm[i] = mm_float4((cl_float) vel[i][0], (cl_float) vel[i][1], (cl_float) vel[i][2], velm[i].w); for (int i = 0; i < numParticles; i++)
cl.getQueue().enqueueWriteBuffer(velocities->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_float4), numParticles*sizeof(mm_float4), &velm[0]); velm[i] = mm_double4(vel[i][0], vel[i][1], vel[i][2], velm[i].w);
cl.getQueue().enqueueWriteBuffer(velocities->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &velm[0]);
}
else {
vector<mm_float4> velm(cl.getPaddedNumAtoms());
cl.getVelm().download(velm);
for (int i = 0; i < numParticles; i++)
velm[i] = mm_float4((cl_float) vel[i][0], (cl_float) vel[i][1], (cl_float) vel[i][2], velm[i].w);
cl.getQueue().enqueueWriteBuffer(velocities->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_float4), numParticles*sizeof(mm_float4), &velm[0]);
}
} }
void OpenCLIntegrateRPMDStepKernel::copyToContext(int copy, ContextImpl& context) { void OpenCLIntegrateRPMDStepKernel::copyToContext(int copy, ContextImpl& context) {
copyToContextKernel.setArg<cl::Buffer>(0, positions->getDeviceBuffer()); copyPositionsToContextKernel.setArg<cl_int>(3, copy);
copyToContextKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer()); cl.executeKernel(copyPositionsToContextKernel, cl.getNumAtoms());
copyToContextKernel.setArg<cl::Buffer>(2, cl.getAtomIndexArray().getDeviceBuffer()); copyVelocitiesToContextKernel.setArg<cl_int>(3, copy);
copyToContextKernel.setArg<cl_int>(3, copy); cl.executeKernel(copyVelocitiesToContextKernel, cl.getNumAtoms());
cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
copyToContextKernel.setArg<cl::Buffer>(0, velocities->getDeviceBuffer());
copyToContextKernel.setArg<cl::Buffer>(1, cl.getVelm().getDeviceBuffer());
cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
} }
string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable, bool forward) { string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable, bool forward) {
...@@ -232,10 +278,10 @@ string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable ...@@ -232,10 +278,10 @@ string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable
string multImag = (forward ? "multiplyComplexImagPart" : "multiplyComplexImagPartConj"); string multImag = (forward ? "multiplyComplexImagPart" : "multiplyComplexImagPartConj");
source<<"{\n"; source<<"{\n";
source<<"__local float4* real0 = "<<variable<<"real;\n"; source<<"__local mixed4* real0 = "<<variable<<"real;\n";
source<<"__local float4* imag0 = "<<variable<<"imag;\n"; source<<"__local mixed4* imag0 = "<<variable<<"imag;\n";
source<<"__local float4* real1 = &temp[blockStart];\n"; source<<"__local mixed4* real1 = &temp[blockStart];\n";
source<<"__local float4* imag1 = &temp[blockStart+get_local_size(0)];\n"; source<<"__local mixed4* imag1 = &temp[blockStart+get_local_size(0)];\n";
// Factor size, generating an appropriate block of code for each factor. // Factor size, generating an appropriate block of code for each factor.
...@@ -249,39 +295,39 @@ string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable ...@@ -249,39 +295,39 @@ string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable
source<<"if (indexInBlock < "<<(L*m)<<") {\n"; source<<"if (indexInBlock < "<<(L*m)<<") {\n";
source<<"int i = indexInBlock;\n"; source<<"int i = indexInBlock;\n";
source<<"int j = i/"<<m<<";\n"; source<<"int j = i/"<<m<<";\n";
source<<"float4 c0r = real"<<input<<"[i];\n"; source<<"mixed4 c0r = real"<<input<<"[i];\n";
source<<"float4 c0i = imag"<<input<<"[i];\n"; source<<"mixed4 c0i = imag"<<input<<"[i];\n";
source<<"float4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"float4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"float4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n"; source<<"mixed4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"float4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n"; source<<"mixed4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"float4 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n"; source<<"mixed4 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n";
source<<"float4 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n"; source<<"mixed4 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n";
source<<"float4 c4r = real"<<input<<"[i+"<<(4*L*m)<<"];\n"; source<<"mixed4 c4r = real"<<input<<"[i+"<<(4*L*m)<<"];\n";
source<<"float4 c4i = imag"<<input<<"[i+"<<(4*L*m)<<"];\n"; source<<"mixed4 c4i = imag"<<input<<"[i+"<<(4*L*m)<<"];\n";
source<<"float4 d0r = c1r+c4r;\n"; source<<"mixed4 d0r = c1r+c4r;\n";
source<<"float4 d0i = c1i+c4i;\n"; source<<"mixed4 d0i = c1i+c4i;\n";
source<<"float4 d1r = c2r+c3r;\n"; source<<"mixed4 d1r = c2r+c3r;\n";
source<<"float4 d1i = c2i+c3i;\n"; source<<"mixed4 d1i = c2i+c3i;\n";
source<<"float4 d2r = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c1r-c4r);\n"; source<<"mixed4 d2r = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c1r-c4r);\n";
source<<"float4 d2i = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c1i-c4i);\n"; source<<"mixed4 d2i = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c1i-c4i);\n";
source<<"float4 d3r = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c2r-c3r);\n"; source<<"mixed4 d3r = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c2r-c3r);\n";
source<<"float4 d3i = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c2i-c3i);\n"; source<<"mixed4 d3i = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c2i-c3i);\n";
source<<"float4 d4r = d0r+d1r;\n"; source<<"mixed4 d4r = d0r+d1r;\n";
source<<"float4 d4i = d0i+d1i;\n"; source<<"mixed4 d4i = d0i+d1i;\n";
source<<"float4 d5r = "<<cl.doubleToString(0.25*sqrt(5.0))<<"*(d0r-d1r);\n"; source<<"mixed4 d5r = "<<cl.doubleToString(0.25*sqrt(5.0))<<"*(d0r-d1r);\n";
source<<"float4 d5i = "<<cl.doubleToString(0.25*sqrt(5.0))<<"*(d0i-d1i);\n"; source<<"mixed4 d5i = "<<cl.doubleToString(0.25*sqrt(5.0))<<"*(d0i-d1i);\n";
source<<"float4 d6r = c0r-0.25f*d4r;\n"; source<<"mixed4 d6r = c0r-0.25f*d4r;\n";
source<<"float4 d6i = c0i-0.25f*d4i;\n"; source<<"mixed4 d6i = c0i-0.25f*d4i;\n";
source<<"float4 d7r = d6r+d5r;\n"; source<<"mixed4 d7r = d6r+d5r;\n";
source<<"float4 d7i = d6i+d5i;\n"; source<<"mixed4 d7i = d6i+d5i;\n";
source<<"float4 d8r = d6r-d5r;\n"; source<<"mixed4 d8r = d6r-d5r;\n";
source<<"float4 d8i = d6i-d5i;\n"; source<<"mixed4 d8i = d6i-d5i;\n";
string coeff = cl.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI)); string coeff = cl.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
source<<"float4 d9r = "<<sign<<"*(d2i+"<<coeff<<"*d3i);\n"; source<<"mixed4 d9r = "<<sign<<"*(d2i+"<<coeff<<"*d3i);\n";
source<<"float4 d9i = "<<sign<<"*(-d2r-"<<coeff<<"*d3r);\n"; source<<"mixed4 d9i = "<<sign<<"*(-d2r-"<<coeff<<"*d3r);\n";
source<<"float4 d10r = "<<sign<<"*("<<coeff<<"*d2i-d3i);\n"; source<<"mixed4 d10r = "<<sign<<"*("<<coeff<<"*d2i-d3i);\n";
source<<"float4 d10i = "<<sign<<"*(d3r-"<<coeff<<"*d2r);\n"; source<<"mixed4 d10i = "<<sign<<"*(d3r-"<<coeff<<"*d2r);\n";
source<<"real"<<output<<"[i+4*j*"<<m<<"] = c0r+d4r;\n"; source<<"real"<<output<<"[i+4*j*"<<m<<"] = c0r+d4r;\n";
source<<"imag"<<output<<"[i+4*j*"<<m<<"] = c0i+d4i;\n"; source<<"imag"<<output<<"[i+4*j*"<<m<<"] = c0i+d4i;\n";
source<<"real"<<output<<"[i+(4*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(5*L)<<"], d7r+d9r, d7i+d9i);\n"; source<<"real"<<output<<"[i+(4*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(5*L)<<"], d7r+d9r, d7i+d9i);\n";
...@@ -302,22 +348,22 @@ string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable ...@@ -302,22 +348,22 @@ string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable
source<<"if (indexInBlock < "<<(L*m)<<") {\n"; source<<"if (indexInBlock < "<<(L*m)<<") {\n";
source<<"int i = indexInBlock;\n"; source<<"int i = indexInBlock;\n";
source<<"int j = i/"<<m<<";\n"; source<<"int j = i/"<<m<<";\n";
source<<"float4 c0r = real"<<input<<"[i];\n"; source<<"mixed4 c0r = real"<<input<<"[i];\n";
source<<"float4 c0i = imag"<<input<<"[i];\n"; source<<"mixed4 c0i = imag"<<input<<"[i];\n";
source<<"float4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"float4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"float4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n"; source<<"mixed4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"float4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n"; source<<"mixed4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"float4 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n"; source<<"mixed4 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n";
source<<"float4 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n"; source<<"mixed4 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n";
source<<"float4 d0r = c0r+c2r;\n"; source<<"mixed4 d0r = c0r+c2r;\n";
source<<"float4 d0i = c0i+c2i;\n"; source<<"mixed4 d0i = c0i+c2i;\n";
source<<"float4 d1r = c0r-c2r;\n"; source<<"mixed4 d1r = c0r-c2r;\n";
source<<"float4 d1i = c0i-c2i;\n"; source<<"mixed4 d1i = c0i-c2i;\n";
source<<"float4 d2r = c1r+c3r;\n"; source<<"mixed4 d2r = c1r+c3r;\n";
source<<"float4 d2i = c1i+c3i;\n"; source<<"mixed4 d2i = c1i+c3i;\n";
source<<"float4 d3r = "<<sign<<"*(c1i-c3i);\n"; source<<"mixed4 d3r = "<<sign<<"*(c1i-c3i);\n";
source<<"float4 d3i = "<<sign<<"*(c3r-c1r);\n"; source<<"mixed4 d3i = "<<sign<<"*(c3r-c1r);\n";
source<<"real"<<output<<"[i+3*j*"<<m<<"] = d0r+d2r;\n"; source<<"real"<<output<<"[i+3*j*"<<m<<"] = d0r+d2r;\n";
source<<"imag"<<output<<"[i+3*j*"<<m<<"] = d0i+d2i;\n"; source<<"imag"<<output<<"[i+3*j*"<<m<<"] = d0i+d2i;\n";
source<<"real"<<output<<"[i+(3*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(4*L)<<"], d1r+d3r, d1i+d3i);\n"; source<<"real"<<output<<"[i+(3*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(4*L)<<"], d1r+d3r, d1i+d3i);\n";
...@@ -336,18 +382,18 @@ string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable ...@@ -336,18 +382,18 @@ string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable
source<<"if (indexInBlock < "<<(L*m)<<") {\n"; source<<"if (indexInBlock < "<<(L*m)<<") {\n";
source<<"int i = indexInBlock;\n"; source<<"int i = indexInBlock;\n";
source<<"int j = i/"<<m<<";\n"; source<<"int j = i/"<<m<<";\n";
source<<"float4 c0r = real"<<input<<"[i];\n"; source<<"mixed4 c0r = real"<<input<<"[i];\n";
source<<"float4 c0i = imag"<<input<<"[i];\n"; source<<"mixed4 c0i = imag"<<input<<"[i];\n";
source<<"float4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"float4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"float4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n"; source<<"mixed4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"float4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n"; source<<"mixed4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"float4 d0r = c1r+c2r;\n"; source<<"mixed4 d0r = c1r+c2r;\n";
source<<"float4 d0i = c1i+c2i;\n"; source<<"mixed4 d0i = c1i+c2i;\n";
source<<"float4 d1r = c0r-0.5f*d0r;\n"; source<<"mixed4 d1r = c0r-0.5f*d0r;\n";
source<<"float4 d1i = c0i-0.5f*d0i;\n"; source<<"mixed4 d1i = c0i-0.5f*d0i;\n";
source<<"float4 d2r = "<<sign<<"*"<<cl.doubleToString(sin(M_PI/3.0))<<"*(c1i-c2i);\n"; source<<"mixed4 d2r = "<<sign<<"*"<<cl.doubleToString(sin(M_PI/3.0))<<"*(c1i-c2i);\n";
source<<"float4 d2i = "<<sign<<"*"<<cl.doubleToString(sin(M_PI/3.0))<<"*(c2r-c1r);\n"; source<<"mixed4 d2i = "<<sign<<"*"<<cl.doubleToString(sin(M_PI/3.0))<<"*(c2r-c1r);\n";
source<<"real"<<output<<"[i+2*j*"<<m<<"] = c0r+d0r;\n"; source<<"real"<<output<<"[i+2*j*"<<m<<"] = c0r+d0r;\n";
source<<"imag"<<output<<"[i+2*j*"<<m<<"] = c0i+d0i;\n"; source<<"imag"<<output<<"[i+2*j*"<<m<<"] = c0i+d0i;\n";
source<<"real"<<output<<"[i+(2*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(3*L)<<"], d1r+d2r, d1i+d2i);\n"; source<<"real"<<output<<"[i+(2*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(3*L)<<"], d1r+d2r, d1i+d2i);\n";
...@@ -364,10 +410,10 @@ string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable ...@@ -364,10 +410,10 @@ string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable
source<<"if (indexInBlock < "<<(L*m)<<") {\n"; source<<"if (indexInBlock < "<<(L*m)<<") {\n";
source<<"int i = indexInBlock;\n"; source<<"int i = indexInBlock;\n";
source<<"int j = i/"<<m<<";\n"; source<<"int j = i/"<<m<<";\n";
source<<"float4 c0r = real"<<input<<"[i];\n"; source<<"mixed4 c0r = real"<<input<<"[i];\n";
source<<"float4 c0i = imag"<<input<<"[i];\n"; source<<"mixed4 c0i = imag"<<input<<"[i];\n";
source<<"float4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"float4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"real"<<output<<"[i+j*"<<m<<"] = c0r+c1r;\n"; source<<"real"<<output<<"[i+j*"<<m<<"] = c0r+c1r;\n";
source<<"imag"<<output<<"[i+j*"<<m<<"] = c0i+c1i;\n"; source<<"imag"<<output<<"[i+j*"<<m<<"] = c0i+c1i;\n";
source<<"real"<<output<<"[i+(j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(2*L)<<"], c0r-c1r, c0i-c1i);\n"; source<<"real"<<output<<"[i+(j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(2*L)<<"], c0r-c1r, c0i-c1i);\n";
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011 Stanford University and the Authors. * * Portions copyright (c) 2011-2012 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -92,7 +92,7 @@ private: ...@@ -92,7 +92,7 @@ private:
OpenCLArray* forces; OpenCLArray* forces;
OpenCLArray* positions; OpenCLArray* positions;
OpenCLArray* velocities; OpenCLArray* velocities;
cl::Kernel pileKernel, stepKernel, velocitiesKernel, copyToContextKernel, copyFromContextKernel, translateKernel; cl::Kernel pileKernel, stepKernel, velocitiesKernel, copyPositionsToContextKernel, copyVelocitiesToContextKernel, copyForcesFromContextKernel, translateKernel;
}; };
} // namespace OpenMM } // namespace OpenMM
......
float4 multiplyComplexRealPart(float2 c1, float4 c2r, float4 c2i) { mixed4 multiplyComplexRealPart(mixed2 c1, mixed4 c2r, mixed4 c2i) {
return c1.x*c2r-c1.y*c2i; return c1.x*c2r-c1.y*c2i;
} }
float4 multiplyComplexImagPart(float2 c1, float4 c2r, float4 c2i) { mixed4 multiplyComplexImagPart(mixed2 c1, mixed4 c2r, mixed4 c2i) {
return c1.x*c2i+c1.y*c2r; return c1.x*c2i+c1.y*c2r;
} }
float4 multiplyComplexRealPartConj(float2 c1, float4 c2r, float4 c2i) { mixed4 multiplyComplexRealPartConj(mixed2 c1, mixed4 c2r, mixed4 c2i) {
return c1.x*c2r+c1.y*c2i; return c1.x*c2r+c1.y*c2i;
} }
float4 multiplyComplexImagPartConj(float2 c1, float4 c2r, float4 c2i) { mixed4 multiplyComplexImagPartConj(mixed2 c1, mixed4 c2r, mixed4 c2i) {
return c1.x*c2i-c1.y*c2r; return c1.x*c2i-c1.y*c2r;
} }
/** /**
* Apply the PILE-L thermostat. * Apply the PILE-L thermostat.
*/ */
__kernel void applyPileThermostat(__global float4* velm, __local float4* v, __local float4* temp, __local float2* w, __global float4* random, unsigned int randomIndex, __kernel void applyPileThermostat(__global mixed4* velm, __global float4* random, unsigned int randomIndex,
float dt, float kT, float friction) { mixed dt, mixed kT, mixed friction) {
const int numBlocks = get_global_size(0)/NUM_COPIES; const int numBlocks = get_global_size(0)/NUM_COPIES;
const int blockStart = NUM_COPIES*(get_local_id(0)/NUM_COPIES); const int blockStart = NUM_COPIES*(get_local_id(0)/NUM_COPIES);
const int indexInBlock = get_local_id(0)-blockStart; const int indexInBlock = get_local_id(0)-blockStart;
const float nkT = NUM_COPIES*kT; const mixed nkT = NUM_COPIES*kT;
const float twown = 2.0f*nkT/HBAR; const mixed twown = 2.0f*nkT/HBAR;
const float c1_0 = EXP(-0.5f*dt*friction); const mixed c1_0 = exp(-0.5f*dt*friction);
const float c2_0 = SQRT(1.0f-c1_0*c1_0); const mixed c2_0 = sqrt(1.0f-c1_0*c1_0);
__local float4* vreal = &v[blockStart]; __local mixed4 v[2*THREAD_BLOCK_SIZE];
__local float4* vimag = &v[blockStart+get_local_size(0)]; __local mixed4 temp[2*THREAD_BLOCK_SIZE];
__local mixed2 w[NUM_COPIES];
__local mixed4* vreal = &v[blockStart];
__local mixed4* vimag = &v[blockStart+get_local_size(0)];
if (get_local_id(0) < NUM_COPIES) if (get_local_id(0) < NUM_COPIES)
w[indexInBlock] = (float2) (cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES)); w[indexInBlock] = (mixed2) (cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
randomIndex += NUM_COPIES*(get_global_id(0)/NUM_COPIES); randomIndex += NUM_COPIES*(get_global_id(0)/NUM_COPIES);
for (int particle = get_global_id(0)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) { for (int particle = get_global_id(0)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
float4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS]; mixed4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
float invMass = particleVelm.w; mixed invMass = particleVelm.w;
float c3_0 = c2_0*SQRT(nkT*invMass); mixed c3_0 = c2_0*sqrt(nkT*invMass);
// Forward FFT. // Forward FFT.
vreal[indexInBlock] = SCALE*particleVelm; vreal[indexInBlock] = SCALE*particleVelm;
vimag[indexInBlock] = (float4) (0.0f, 0.0f, 0.0f, 0.0f); vimag[indexInBlock] = (mixed4) (0.0f, 0.0f, 0.0f, 0.0f);
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
FFT_V_FORWARD FFT_V_FORWARD
...@@ -49,19 +52,19 @@ __kernel void applyPileThermostat(__global float4* velm, __local float4* v, __lo ...@@ -49,19 +52,19 @@ __kernel void applyPileThermostat(__global float4* velm, __local float4* v, __lo
if (indexInBlock == 0) { if (indexInBlock == 0) {
// Apply a local Langevin thermostat to the centroid mode. // Apply a local Langevin thermostat to the centroid mode.
vreal[0].xyz = vreal[0].xyz*c1_0 + c3_0*random[randomIndex].xyz; vreal[0].xyz = vreal[0].xyz*c1_0 + c3_0*convert_mixed4(random[randomIndex]).xyz;
} }
else { else {
// Use critical damping white noise for the remaining modes. // Use critical damping white noise for the remaining modes.
int k = (indexInBlock <= NUM_COPIES/2 ? indexInBlock : NUM_COPIES-indexInBlock); int k = (indexInBlock <= NUM_COPIES/2 ? indexInBlock : NUM_COPIES-indexInBlock);
const bool isCenter = (NUM_COPIES%2 == 0 && k == NUM_COPIES/2); const bool isCenter = (NUM_COPIES%2 == 0 && k == NUM_COPIES/2);
const float wk = twown*sin(k*M_PI/NUM_COPIES); const mixed wk = twown*sin(k*M_PI/NUM_COPIES);
const float c1 = EXP(-wk*dt); const mixed c1 = exp(-wk*dt);
const float c2 = SQRT((1.0f-c1*c1)/2.0f) * (isCenter ? sqrt(2.0f) : 1.0f); const mixed c2 = sqrt((1.0f-c1*c1)/2.0f) * (isCenter ? sqrt(2.0f) : 1.0f);
const float c3 = c2*SQRT(nkT*invMass); const mixed c3 = c2*sqrt(nkT*invMass);
float4 rand1 = c3*random[randomIndex+k]; mixed4 rand1 = c3*convert_mixed4(random[randomIndex+k]);
float4 rand2 = (isCenter ? 0.0f : c3*random[randomIndex+NUM_COPIES-k]); mixed4 rand2 = (isCenter ? 0.0f : c3*convert_mixed4(random[randomIndex+NUM_COPIES-k]));
vreal[indexInBlock].xyz = c1*vreal[indexInBlock].xyz + rand1.xyz; vreal[indexInBlock].xyz = c1*vreal[indexInBlock].xyz + rand1.xyz;
vimag[indexInBlock].xyz = c1*vimag[indexInBlock].xyz + (indexInBlock < NUM_COPIES/2 ? rand2.xyz : -rand2.xyz); vimag[indexInBlock].xyz = c1*vimag[indexInBlock].xyz + (indexInBlock < NUM_COPIES/2 ? rand2.xyz : -rand2.xyz);
} }
...@@ -78,42 +81,45 @@ __kernel void applyPileThermostat(__global float4* velm, __local float4* v, __lo ...@@ -78,42 +81,45 @@ __kernel void applyPileThermostat(__global float4* velm, __local float4* v, __lo
/** /**
* Advance the positions and velocities. * Advance the positions and velocities.
*/ */
__kernel void integrateStep(__global float4* posq, __global float4* velm, __global float4* force, __kernel void integrateStep(__global mixed4* posq, __global mixed4* velm, __global real4* force, mixed dt, mixed kT) {
__local float4* q, __local float4* v, __local float4* temp, __local float2* w, float dt, float kT) {
const int numBlocks = get_global_size(0)/NUM_COPIES; const int numBlocks = get_global_size(0)/NUM_COPIES;
const int blockStart = NUM_COPIES*(get_local_id(0)/NUM_COPIES); const int blockStart = NUM_COPIES*(get_local_id(0)/NUM_COPIES);
const int indexInBlock = get_local_id(0)-blockStart; const int indexInBlock = get_local_id(0)-blockStart;
const float nkT = NUM_COPIES*kT; const mixed nkT = NUM_COPIES*kT;
const float twown = 2.0f*nkT/HBAR; const mixed twown = 2.0f*nkT/HBAR;
__local mixed4 q[2*THREAD_BLOCK_SIZE];
__local mixed4 v[2*THREAD_BLOCK_SIZE];
__local mixed4 temp[2*THREAD_BLOCK_SIZE];
__local mixed2 w[NUM_COPIES];
// Update velocities. // Update velocities.
for (int particle = get_global_id(0)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) { for (int particle = get_global_id(0)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
int index = particle+indexInBlock*PADDED_NUM_ATOMS; int index = particle+indexInBlock*PADDED_NUM_ATOMS;
float4 particleVelm = velm[index]; mixed4 particleVelm = velm[index];
particleVelm.xyz += force[index].xyz*(0.5f*dt*particleVelm.w); particleVelm.xyz += convert_mixed4(force[index]).xyz*(0.5f*dt*particleVelm.w);
velm[index] = particleVelm; velm[index] = particleVelm;
} }
// Evolve the free ring polymer by transforming to the frequency domain. // Evolve the free ring polymer by transforming to the frequency domain.
__local float4* qreal = &q[blockStart]; __local mixed4* qreal = &q[blockStart];
__local float4* qimag = &q[blockStart+get_local_size(0)]; __local mixed4* qimag = &q[blockStart+get_local_size(0)];
__local float4* vreal = &v[blockStart]; __local mixed4* vreal = &v[blockStart];
__local float4* vimag = &v[blockStart+get_local_size(0)]; __local mixed4* vimag = &v[blockStart+get_local_size(0)];
if (get_local_id(0) < NUM_COPIES) if (get_local_id(0) < NUM_COPIES)
w[indexInBlock] = (float2) (cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES)); w[indexInBlock] = (mixed2) (cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (int particle = get_global_id(0)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) { for (int particle = get_global_id(0)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
float4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS]; mixed4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS];
float4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS]; mixed4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
// Forward FFT. // Forward FFT.
qreal[indexInBlock] = SCALE*particlePosq; qreal[indexInBlock] = SCALE*particlePosq;
qimag[indexInBlock] = (float4) (0.0f, 0.0f, 0.0f, 0.0f); qimag[indexInBlock] = (mixed4) (0.0f, 0.0f, 0.0f, 0.0f);
vreal[indexInBlock] = SCALE*particleVelm; vreal[indexInBlock] = SCALE*particleVelm;
vimag[indexInBlock] = (float4) (0.0f, 0.0f, 0.0f, 0.0f); vimag[indexInBlock] = (mixed4) (0.0f, 0.0f, 0.0f, 0.0f);
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
FFT_Q_FORWARD FFT_Q_FORWARD
FFT_V_FORWARD FFT_V_FORWARD
...@@ -125,12 +131,12 @@ __kernel void integrateStep(__global float4* posq, __global float4* velm, __glob ...@@ -125,12 +131,12 @@ __kernel void integrateStep(__global float4* posq, __global float4* velm, __glob
qimag[0].xyz += vimag[0].xyz*dt; qimag[0].xyz += vimag[0].xyz*dt;
} }
else { else {
const float wk = twown*sin(indexInBlock*M_PI/NUM_COPIES); const mixed wk = twown*sin(indexInBlock*M_PI/NUM_COPIES);
const float wt = wk*dt; const mixed wt = wk*dt;
const float coswt = cos(wt); const mixed coswt = cos(wt);
const float sinwt = sin(wt); const mixed sinwt = sin(wt);
const float4 vprimereal = vreal[indexInBlock]*coswt - qreal[indexInBlock]*(wk*sinwt); // Advance velocity from t to t+dt const mixed4 vprimereal = vreal[indexInBlock]*coswt - qreal[indexInBlock]*(wk*sinwt); // Advance velocity from t to t+dt
const float4 vprimeimag = vimag[indexInBlock]*coswt - qimag[indexInBlock]*(wk*sinwt); const mixed4 vprimeimag = vimag[indexInBlock]*coswt - qimag[indexInBlock]*(wk*sinwt);
qreal[indexInBlock] = vreal[indexInBlock]*(sinwt/wk) + qreal[indexInBlock]*coswt; // Advance position from t to t+dt qreal[indexInBlock] = vreal[indexInBlock]*(sinwt/wk) + qreal[indexInBlock]*coswt; // Advance position from t to t+dt
qimag[indexInBlock] = vimag[indexInBlock]*(sinwt/wk) + qimag[indexInBlock]*coswt; qimag[indexInBlock] = vimag[indexInBlock]*(sinwt/wk) + qimag[indexInBlock]*coswt;
vreal[indexInBlock] = vprimereal; vreal[indexInBlock] = vprimereal;
...@@ -150,7 +156,7 @@ __kernel void integrateStep(__global float4* posq, __global float4* velm, __glob ...@@ -150,7 +156,7 @@ __kernel void integrateStep(__global float4* posq, __global float4* velm, __glob
/** /**
* Advance the velocities by a half step. * Advance the velocities by a half step.
*/ */
__kernel void advanceVelocities(__global float4* velm, __global float4* force, float dt) { __kernel void advanceVelocities(__global mixed4* velm, __global real4* force, mixed dt) {
const int numBlocks = get_global_size(0)/NUM_COPIES; const int numBlocks = get_global_size(0)/NUM_COPIES;
const int blockStart = NUM_COPIES*(get_local_id(0)/NUM_COPIES); const int blockStart = NUM_COPIES*(get_local_id(0)/NUM_COPIES);
const int indexInBlock = get_local_id(0)-blockStart; const int indexInBlock = get_local_id(0)-blockStart;
...@@ -159,16 +165,26 @@ __kernel void advanceVelocities(__global float4* velm, __global float4* force, f ...@@ -159,16 +165,26 @@ __kernel void advanceVelocities(__global float4* velm, __global float4* force, f
for (int particle = get_global_id(0)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) { for (int particle = get_global_id(0)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
int index = particle+indexInBlock*PADDED_NUM_ATOMS; int index = particle+indexInBlock*PADDED_NUM_ATOMS;
float4 particleVelm = velm[index]; mixed4 particleVelm = velm[index];
particleVelm.xyz += force[index].xyz*(0.5f*dt*particleVelm.w); particleVelm.xyz += convert_mixed4(force[index]).xyz*(0.5f*dt*particleVelm.w);
velm[index] = particleVelm; velm[index] = particleVelm;
} }
} }
/** /**
* Copy a set of per-atom values from the integrator's arrays to the context. * Copy a set of positions from the integrator's arrays to the context.
*/ */
__kernel void copyToContext(__global float4* src, __global float4* dst, __global int* order, int copy) { __kernel void copyPositionsToContext(__global mixed4* src, __global real4* dst, __global int* order, int copy) {
const int base = copy*PADDED_NUM_ATOMS;
for (int particle = get_global_id(0); particle < NUM_ATOMS; particle += get_global_size(0)) {
dst[particle] = convert_real4(src[base+order[particle]]);
}
}
/**
* Copy a set of velocities from the integrator's arrays to the context.
*/
__kernel void copyVelocitiesToContext(__global mixed4* src, __global mixed4* dst, __global int* order, int copy) {
const int base = copy*PADDED_NUM_ATOMS; const int base = copy*PADDED_NUM_ATOMS;
for (int particle = get_global_id(0); particle < NUM_ATOMS; particle += get_global_size(0)) { for (int particle = get_global_id(0); particle < NUM_ATOMS; particle += get_global_size(0)) {
dst[particle] = src[base+order[particle]]; dst[particle] = src[base+order[particle]];
...@@ -176,9 +192,9 @@ __kernel void copyToContext(__global float4* src, __global float4* dst, __global ...@@ -176,9 +192,9 @@ __kernel void copyToContext(__global float4* src, __global float4* dst, __global
} }
/** /**
* Copy a set of per-atom values from the context to the integrator's arrays. * Copy a set forces from the context to the integrator's arrays.
*/ */
__kernel void copyFromContext(__global float4* src, __global float4* dst, __global int* order, int copy) { __kernel void copyForcesFromContext(__global real4* src, __global real4* dst, __global int* order, int copy) {
const int base = copy*PADDED_NUM_ATOMS; const int base = copy*PADDED_NUM_ATOMS;
for (int particle = get_global_id(0); particle < NUM_ATOMS; particle += get_global_size(0)) { for (int particle = get_global_id(0); particle < NUM_ATOMS; particle += get_global_size(0)) {
dst[base+order[particle]] = src[particle]; dst[base+order[particle]] = src[particle];
...@@ -188,10 +204,10 @@ __kernel void copyFromContext(__global float4* src, __global float4* dst, __glob ...@@ -188,10 +204,10 @@ __kernel void copyFromContext(__global float4* src, __global float4* dst, __glob
/** /**
* Update atom positions so all copies are offset by the same number of periodic box widths. * Update atom positions so all copies are offset by the same number of periodic box widths.
*/ */
__kernel void applyCellTranslations(__global float4* posq, __global float4* movedPos, __global int* order, int movedCopy) { __kernel void applyCellTranslations(__global mixed4* posq, __global real4* movedPos, __global int* order, int movedCopy) {
for (int particle = get_global_id(0); particle < NUM_ATOMS; particle += get_global_size(0)) { for (int particle = get_global_id(0); particle < NUM_ATOMS; particle += get_global_size(0)) {
int index = order[particle]; int index = order[particle];
float4 delta = movedPos[particle]-posq[movedCopy*PADDED_NUM_ATOMS+index]; mixed4 delta = convert_mixed4(movedPos[particle])-posq[movedCopy*PADDED_NUM_ATOMS+index];
for (int copy = 0; copy < NUM_COPIES; copy++) for (int copy = 0; copy < NUM_COPIES; copy++)
posq[copy*PADDED_NUM_ATOMS+index] += delta; posq[copy*PADDED_NUM_ATOMS+index] += delta;
} }
......
...@@ -14,6 +14,10 @@ FOREACH(TEST_PROG ${TEST_PROGS}) ...@@ -14,6 +14,10 @@ FOREACH(TEST_PROG ${TEST_PROGS})
# Link with shared library # Link with shared library
ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG}) ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_RPMD_TARGET}) TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_RPMD_TARGET})
ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT}) ADD_TEST(${TEST_ROOT}Single ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} single)
IF (OPENMM_BUILD_CUDA_DOUBLE_PRECISION_TESTS)
ADD_TEST(${TEST_ROOT}Mixed ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} mixed)
ADD_TEST(${TEST_ROOT}Double ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} double)
ENDIF(OPENMM_BUILD_CUDA_DOUBLE_PRECISION_TESTS)
ENDFOREACH(TEST_PROG ${TEST_PROGS}) ENDFOREACH(TEST_PROG ${TEST_PROGS})
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011 Stanford University and the Authors. * * Portions copyright (c) 2011-2012 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -221,9 +221,11 @@ void testParaHydrogen() { ...@@ -221,9 +221,11 @@ void testParaHydrogen() {
ASSERT_USUALLY_EQUAL_TOL(60.0, 1.5*temperature+meanKE, 0.02); ASSERT_USUALLY_EQUAL_TOL(60.0, 1.5*temperature+meanKE, 0.02);
} }
int main() { int main(int argc, char* argv[]) {
try { try {
Platform::loadPluginsFromDirectory(Platform::getDefaultPluginsDirectory()); Platform::loadPluginsFromDirectory(Platform::getDefaultPluginsDirectory());
if (argc > 1)
Platform::getPlatformByName("OpenCL").setPropertyDefaultValue("OpenCLPrecision", string(argv[1]));
testFreeParticles(); testFreeParticles();
testParaHydrogen(); testParaHydrogen();
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment