Commit 19d2885a authored by Lee-Ping's avatar Lee-Ping
Browse files

Merge github.com:SimTk/openmm

parents 99ef4344 57a6768e
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2014 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CpuBondForce.h"
#include "openmm/OpenMMException.h"
using namespace OpenMM;
using namespace std;
class CpuBondForce::ComputeForceTask : public ThreadPool::Task {
public:
ComputeForceTask(CpuBondForce& owner, vector<RealVec>& atomCoordinates, RealOpenMM** parameters, vector<RealVec>& forces,
vector<RealOpenMM>& threadEnergy, RealOpenMM* totalEnergy, ReferenceBondIxn& referenceBondIxn) : owner(owner), atomCoordinates(atomCoordinates),
parameters(parameters), forces(forces), threadEnergy(threadEnergy), totalEnergy(totalEnergy), referenceBondIxn(referenceBondIxn) {
}
void execute(ThreadPool& threads, int threadIndex) {
RealOpenMM* energy = (totalEnergy == NULL ? NULL : &threadEnergy[threadIndex]);
owner.threadComputeForce(threads, threadIndex, atomCoordinates, parameters, forces, energy, referenceBondIxn);
}
CpuBondForce& owner;
vector<RealVec>& atomCoordinates;
RealOpenMM** parameters;
vector<RealVec>& forces;
vector<RealOpenMM>& threadEnergy;
RealOpenMM* totalEnergy;
ReferenceBondIxn& referenceBondIxn;
};
CpuBondForce::CpuBondForce() {
}
void CpuBondForce::initialize(int numAtoms, int numBonds, int numAtomsPerBond, int** bondAtoms, ThreadPool& threads) {
this->numBonds = numBonds;
this->numAtomsPerBond = numAtomsPerBond;
this->bondAtoms = bondAtoms;
this->threads = &threads;
int numThreads = threads.getNumThreads();
int targetBondsPerThread = numBonds/numThreads;
// Record the bonds that include each atom.
vector<set<int> > atomBonds(numAtoms);
for (int bond = 0; bond < numBonds; bond++) {
for (int i = 0; i < numAtomsPerBond; i++)
atomBonds[bondAtoms[bond][i]].insert(bond);
}
// Divide bonds into groups.
vector<int> atomThread(numAtoms, -1);
vector<int> bondThread(numBonds, -1);
threadBonds.resize(numThreads);
int numProcessed = 0;
int thread = 0;
list<int> candidateBonds;
while (thread < numThreads) {
// Find the next unassigned bond.
while (numProcessed < numBonds && bondThread[numProcessed] != -1)
numProcessed++;
if (numProcessed == numBonds)
break; // We've gone through the whole list of bonds.
// See if this bond can be assigned to this thread.
if (!canAssignBond(numProcessed, thread, atomThread)) {
numProcessed++;
continue;
}
// Assign this bond to the thread.
assignBond(numProcessed++, thread, atomThread, bondThread, atomBonds, candidateBonds);
// Assign additional bonds that have been identified as involving atoms assigned to this thread.
while (!candidateBonds.empty() && threadBonds[thread].size() < targetBondsPerThread) {
int bond = *candidateBonds.begin();
if (bondThread[bond] == -1 && canAssignBond(bond, thread, atomThread))
assignBond(bond, thread, atomThread, bondThread, atomBonds, candidateBonds);
candidateBonds.pop_front();
}
// If we have assigned enough bonds to this thread, move on to the next one.
if (threadBonds[thread].size() >= targetBondsPerThread) {
candidateBonds.clear();
thread++;
}
}
// Look through the remaining bonds and see whether any of them can be assigned.
candidateBonds.clear();
for (int bond = 0; bond < numBonds; bond++) {
if (bondThread[bond] == -1) {
// See whether this bond can be assigned to a thread.
bool canAssign = true;
int assignment = -1;
for (int i = 0; i < numAtomsPerBond; i++) {
int thread = atomThread[bondAtoms[bond][i]];
if (thread == -1 || thread == assignment)
continue;
if (assignment == -1)
assignment = thread;
else {
canAssign = false;
break;
}
}
if (canAssign) {
// Assign this bond to a thread.
if (assignment == -1)
assignment = numThreads-1;
assignBond(bond, assignment, atomThread, bondThread, atomBonds, candidateBonds);
}
else {
// Add it to the list of "extra" bonds.
extraBonds.push_back(bond);
}
}
}
}
bool CpuBondForce::canAssignBond(int bond, int thread, vector<int>& atomThread) {
for (int i = 0; i < numAtomsPerBond; i++) {
int atom = bondAtoms[bond][i];
if (atomThread[atom] != -1 && atomThread[atom] != thread)
return false;
}
return true;
}
void CpuBondForce::assignBond(int bond, int thread, vector<int>& atomThread, vector<int>& bondThread, vector<set<int> >& atomBonds, list<int>& candidateBonds) {
// Assign the bond to a thread.
bondThread[bond] = thread;
threadBonds[thread].push_back(bond);
// Mark every atom in this bond as also belonging to the thread, and add all of their
// bonds to the list of candidates.
for (int i = 0; i < numAtomsPerBond; i++) {
int& atom = atomThread[bondAtoms[bond][i]];
if (atom == thread)
continue;
if (atom != -1)
throw OpenMMException("CpuBondForce: Internal error: atoms assigned to threads incorrectly");
atom = thread;
for (set<int>::const_iterator iter = atomBonds[atom].begin(); iter != atomBonds[atom].end(); ++iter)
candidateBonds.push_back(*iter);
}
}
void CpuBondForce::calculateForce(vector<RealVec>& atomCoordinates, RealOpenMM** parameters, vector<RealVec>& forces,
RealOpenMM* totalEnergy, ReferenceBondIxn& referenceBondIxn) {
// Have the worker threads compute their forces.
vector<RealOpenMM> threadEnergy(threads->getNumThreads(), 0);
ComputeForceTask task(*this, atomCoordinates, parameters, forces, threadEnergy, totalEnergy, referenceBondIxn);
threads->execute(task);
threads->waitForThreads();
// Compute any "extra" bonds.
for (int i = 0; i < extraBonds.size(); i++) {
int bond = extraBonds[i];
referenceBondIxn.calculateBondIxn(bondAtoms[bond], atomCoordinates, parameters[bond], forces, totalEnergy);
}
// Compute the total energy.
if (totalEnergy != NULL)
for (int i = 0; i < threads->getNumThreads(); i++)
*totalEnergy += threadEnergy[i];
}
void CpuBondForce::threadComputeForce(ThreadPool& threads, int threadIndex, vector<RealVec>& atomCoordinates, RealOpenMM** parameters, vector<RealVec>& forces,
RealOpenMM* totalEnergy, ReferenceBondIxn& referenceBondIxn) {
vector<int>& bonds = threadBonds[threadIndex];
int numBonds = bonds.size();
for (int i = 0; i < numBonds; i++) {
int bond = bonds[i];
referenceBondIxn.calculateBondIxn(bondAtoms[bond], atomCoordinates, parameters[bond], forces, totalEnergy);
}
}
\ No newline at end of file
/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
* Contributors: Pande Group
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject
* to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "CpuGBSAOBCForce.h"
#include "SimTKOpenMMRealType.h"
#include "openmm/internal/vectorize.h"
#include "gmx_atomic.h"
#include <algorithm>
#include <cmath>
#include <cstdlib>
using namespace std;
using namespace OpenMM;
const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 4096;
const float CpuGBSAOBCForce::TABLE_MIN = 0.25f;
const float CpuGBSAOBCForce::TABLE_MAX = 1.5f;
class CpuGBSAOBCForce::ComputeTask : public ThreadPool::Task {
public:
ComputeTask(CpuGBSAOBCForce& owner) : owner(owner) {
}
void execute(ThreadPool& threads, int threadIndex) {
owner.threadComputeForce(threads, threadIndex);
}
CpuGBSAOBCForce& owner;
};
CpuGBSAOBCForce::CpuGBSAOBCForce() : cutoff(false), periodic(false) {
logDX = (TABLE_MAX-TABLE_MIN)/NUM_TABLE_POINTS;
logDXInv = 1.0f/logDX;
logTable.resize(NUM_TABLE_POINTS+4);
for (int i = 0; i < NUM_TABLE_POINTS+4; i++) {
double x = TABLE_MIN+i*logDX;
logTable[i] = log(x);
}
}
void CpuGBSAOBCForce::setUseCutoff(float distance) {
cutoff = true;
cutoffDistance = distance;
}
void CpuGBSAOBCForce::setPeriodic(float* periodicBoxSize) {
periodic = true;
this->periodicBoxSize[0] = periodicBoxSize[0];
this->periodicBoxSize[1] = periodicBoxSize[1];
this->periodicBoxSize[2] = periodicBoxSize[2];
}
void CpuGBSAOBCForce::setSoluteDielectric(float dielectric) {
soluteDielectric = dielectric;
}
void CpuGBSAOBCForce::setSolventDielectric(float dielectric) {
solventDielectric = dielectric;
}
const std::vector<std::pair<float, float> >& CpuGBSAOBCForce::getParticleParameters() const {
return particleParams;
}
void CpuGBSAOBCForce::setParticleParameters(const std::vector<std::pair<float, float> >& params) {
particleParams = params;
bornRadii.resize(params.size()+3);
obcChain.resize(params.size()+3);
}
void CpuGBSAOBCForce::computeForce(const AlignedArray<float>& posq, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
// Record the parameters for the threads.
this->posq = &posq[0];
this->threadForce = &threadForce;
includeEnergy = (totalEnergy != NULL);
int numThreads = threads.getNumThreads();
threadEnergy.resize(numThreads);
threadBornForces.resize(numThreads);
for (int i = 0; i < numThreads; i++)
threadBornForces[i].resize(particleParams.size()+3);
gmx_atomic_t counter;
this->atomicCounter = &counter;
// Signal the threads to start running and wait for them to finish.
ComputeTask task(*this);
gmx_atomic_set(&counter, 0);
threads.execute(task);
threads.waitForThreads(); // Compute Born radii
gmx_atomic_set(&counter, 0);
threads.resumeThreads();
threads.waitForThreads(); // Compute surface area term
gmx_atomic_set(&counter, 0);
threads.resumeThreads();
threads.waitForThreads(); // First loop
gmx_atomic_set(&counter, 0);
threads.resumeThreads();
threads.waitForThreads(); // Second loop
// Combine the energies from all the threads.
if (totalEnergy != NULL) {
double energy = 0;
for (int i = 0; i < numThreads; i++)
energy += threadEnergy[i];
*totalEnergy += (float) energy;
}
}
void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
int numParticles = particleParams.size();
int numThreads = threads.getNumThreads();
const float dielectricOffset = 0.009;
const float alphaObc = 1.0f;
const float betaObc = 0.8f;
const float gammaObc = 4.85f;
fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
int start = (threadIndex*numParticles)/numThreads;
int end = ((threadIndex+1)*numParticles)/numThreads;
// Calculate Born radii
while (true) {
int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
if (blockStart >= numParticles)
break;
int numInBlock = min(4, numParticles-blockStart);
ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
float atomRadius[4], atomx[4], atomy[4], atomz[4];
int blockMask[4] = {0, 0, 0, 0};
for (int i = 0; i < numInBlock; i++) {
int atomIndex = blockStart+i;
atomRadius[i] = particleParams[atomIndex].first;
atomx[i] = posq[4*atomIndex];
atomy[i] = posq[4*atomIndex+1];
atomz[i] = posq[4*atomIndex+2];
blockMask[i] = 0xFFFFFFFF;
}
fvec4 offsetRadiusI(atomRadius);
fvec4 radiusIInverse = 1.0f/offsetRadiusI;
fvec4 x(atomx);
fvec4 y(atomy);
fvec4 z(atomz);
ivec4 mask(blockMask);
float sum[4] = {0.0f, 0.0f, 0.0f, 0.0f};
for (int atomJ = 0; atomJ < numParticles; atomJ++) {
fvec4 posJ(posq+4*atomJ);
fvec4 dx, dy, dz, r2;
getDeltaR(posJ, x, y, z, dx, dy, dz, r2, periodic, boxSize, invBoxSize);
ivec4 include = mask & (blockAtomIndex != ivec4(atomJ));
if (cutoff)
include = include & (r2 < cutoffDistance*cutoffDistance);
if (!any(include))
continue;
fvec4 r = sqrt(r2);
float scaledRadiusJ = particleParams[atomJ].second;
float scaledRadiusJ2 = scaledRadiusJ*scaledRadiusJ;
fvec4 rScaledRadiusJ = r + scaledRadiusJ;
include = include & (offsetRadiusI < rScaledRadiusJ);
fvec4 l_ij = 1.0f/max(offsetRadiusI, abs(r-scaledRadiusJ));
fvec4 u_ij = 1.0f/rScaledRadiusJ;
fvec4 l_ij2 = l_ij*l_ij;
fvec4 u_ij2 = u_ij*u_ij;
fvec4 rInverse = 1.0f/r;
fvec4 r2Inverse = rInverse*rInverse;
fvec4 logRatio = fastLog(u_ij/l_ij);
fvec4 term = l_ij - u_ij + 0.25f*r*(u_ij2 - l_ij2) + (0.5f*rInverse*logRatio) + (0.25f*scaledRadiusJ*scaledRadiusJ*rInverse)*(l_ij2 - u_ij2);
for (int j = 0; j < 4; j++) {
if (include[j]) {
sum[j] += term[j];
if (offsetRadiusI[j] < scaledRadiusJ-r[j])
sum[j] += 2.0f*(radiusIInverse[j]-l_ij[j]);
}
}
}
for (int i = 0; i < numInBlock; i++) {
int atomIndex = blockStart+i;
sum[i] *= 0.5f*atomRadius[i];
float sum2 = sum[i]*sum[i];
float sum3 = sum[i]*sum2;
float tanhSum = tanh(alphaObc*sum[i] - betaObc*sum2 + gammaObc*sum3);
float radiusI = atomRadius[i] + dielectricOffset;
bornRadii[atomIndex] = 1.0f/(1.0f/atomRadius[i] - tanhSum/radiusI);
obcChain[atomIndex] = atomRadius[i]*(alphaObc - 2.0f*betaObc*sum[i] + 3.0f*gammaObc*sum2);
obcChain[atomIndex] = (1.0f - tanhSum*tanhSum)*obcChain[atomIndex]/radiusI;
}
}
threads.syncThreads();
// Calculate ACE surface area term.
const float probeRadius = 0.14f;
const float surfaceAreaFactor = 28.3919551;
double energy = 0.0;
vector<float>& bornForces = threadBornForces[threadIndex];
for (int i = 0; i < numParticles; i++)
bornForces[i] = 0.0f;
while (true) {
int atomI = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
if (atomI >= numParticles)
break;
if (bornRadii[atomI] > 0) {
float radiusI = particleParams[atomI].first + dielectricOffset;
float r = radiusI + probeRadius;
float ratio6 = powf(radiusI/bornRadii[atomI], 6.0f);
float saTerm = surfaceAreaFactor*r*r*ratio6;
energy += saTerm;
bornForces[atomI] = -6.0f*saTerm/bornRadii[atomI];
}
else
bornForces[atomI] = 0.0f;
}
threads.syncThreads();
// First loop of Born energy computation.
float* forces = &(*threadForce)[threadIndex][0];
float preFactor;
if (soluteDielectric != 0.0f && solventDielectric != 0.0f)
preFactor = ONE_4PI_EPS0*((1.0f/solventDielectric) - (1.0f/soluteDielectric));
else
preFactor = 0.0f;
while (true) {
int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
if (blockStart >= numParticles)
break;
int numInBlock = min(4, numParticles-blockStart);
ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
float atomCharge[4], atomx[4], atomy[4], atomz[4];
int blockMask[4] = {0, 0, 0, 0};
fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f), blockAtomBornForce(0.0f);
for (int i = 0; i < numInBlock; i++) {
int atomIndex = blockStart+i;
atomx[i] = posq[4*atomIndex];
atomy[i] = posq[4*atomIndex+1];
atomz[i] = posq[4*atomIndex+2];
atomCharge[i] = preFactor*posq[4*atomIndex+3];
blockMask[i] = 0xFFFFFFFF;
}
fvec4 radii(&bornRadii[blockStart]);
fvec4 x(atomx);
fvec4 y(atomy);
fvec4 z(atomz);
fvec4 partialChargeI(atomCharge);
ivec4 mask(blockMask);
for (int atomJ = blockStart; atomJ < numParticles; atomJ++) {
fvec4 posJ(posq+4*atomJ);
fvec4 dx, dy, dz, r2;
getDeltaR(posJ, x, y, z, dx, dy, dz, r2, periodic, boxSize, invBoxSize);
ivec4 include = mask & (blockAtomIndex <= ivec4(atomJ));
if (cutoff)
include = include & (r2 < cutoffDistance*cutoffDistance);
if (!any(include))
continue;
fvec4 r = sqrt(r2);
fvec4 alpha2_ij = radii*bornRadii[atomJ];
fvec4 D_ij = r2/(4.0f*alpha2_ij);
fvec4 expTerm(expf(-D_ij[0]), expf(-D_ij[1]), expf(-D_ij[2]), expf(-D_ij[3]));
fvec4 denominator2 = r2 + alpha2_ij*expTerm;
fvec4 denominator = sqrt(denominator2);
fvec4 Gpol = (partialChargeI*posJ[3])/denominator;
fvec4 dGpol_dr = -Gpol*(1.0f - 0.25f*expTerm)/denominator2;
fvec4 dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f + D_ij)/denominator2;
dGpol_dr = blend(0.0f, dGpol_dr, include);
dGpol_dalpha2_ij = blend(0.0f, dGpol_dalpha2_ij, include);
fvec4 fx = dx*dGpol_dr;
fvec4 fy = dy*dGpol_dr;
fvec4 fz = dz*dGpol_dr;
blockAtomForceX -= fx;
blockAtomForceY -= fy;
blockAtomForceZ -= fz;
blockAtomBornForce += dGpol_dalpha2_ij*bornRadii[atomJ];
float* atomForce = forces+4*atomJ;
fvec4 one(1.0f);
atomForce[0] += dot4(fx, one);
atomForce[1] += dot4(fy, one);
atomForce[2] += dot4(fz, one);
ivec4 atomJMask = include & (blockAtomIndex != ivec4(atomJ));
fvec4 termEnergy = blend(0.0f, Gpol, include);
if (cutoff)
termEnergy -= blend(0.0f, partialChargeI*posJ[3]/cutoffDistance, atomJMask);
termEnergy *= blend(0.5f, 1.0f, atomJMask);
energy += dot4(termEnergy, one);
bornForces[atomJ] += dot4(blend(0.0f, dGpol_dalpha2_ij, atomJMask), radii);
}
fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
transpose(f[0], f[1], f[2], f[3]);
for (int i = 0; i < numInBlock; i++) {
int atomIndex = blockStart+i;
(fvec4(forces+4*atomIndex)+f[i]).store(forces+4*atomIndex);
bornForces[atomIndex] += blockAtomBornForce[i];
}
}
threads.syncThreads();
// Second loop of Born energy computation.
while (true) {
int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
if (blockStart >= numParticles)
break;
fvec4 bornForce(0.0f);
for (int i = 0; i < numThreads; i++)
bornForce += fvec4(&threadBornForces[i][blockStart]);
fvec4 radii(&bornRadii[blockStart]);
bornForce *= radii*radii*fvec4(&obcChain[blockStart]);
int numInBlock = min(4, numParticles-blockStart);
ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
float atomRadius[4], atomx[4], atomy[4], atomz[4];
int blockMask[4] = {0, 0, 0, 0};
fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
for (int i = 0; i < numInBlock; i++) {
int atomIndex = blockStart+i;
atomRadius[i] = particleParams[atomIndex].first;
atomx[i] = posq[4*atomIndex];
atomy[i] = posq[4*atomIndex+1];
atomz[i] = posq[4*atomIndex+2];
blockMask[i] = 0xFFFFFFFF;
}
fvec4 offsetRadiusI(atomRadius);
fvec4 x(atomx);
fvec4 y(atomy);
fvec4 z(atomz);
ivec4 mask(blockMask);
for (int atomJ = 0; atomJ < numParticles; atomJ++) {
fvec4 posJ(posq+4*atomJ);
fvec4 dx, dy, dz, r2;
getDeltaR(posJ, x, y, z, dx, dy, dz, r2, periodic, boxSize, invBoxSize);
ivec4 include = mask & (blockAtomIndex != ivec4(atomJ));
if (cutoff)
include = include & (r2 < cutoffDistance*cutoffDistance);
if (!any(include))
continue;
fvec4 r = sqrt(r2);
float scaledRadiusJ = particleParams[atomJ].second;
float scaledRadiusJ2 = scaledRadiusJ*scaledRadiusJ;
fvec4 rScaledRadiusJ = r + scaledRadiusJ;
include = include & (offsetRadiusI < rScaledRadiusJ);
fvec4 l_ij = 1.0f/max(offsetRadiusI, abs(r-scaledRadiusJ));
fvec4 u_ij = 1.0f/rScaledRadiusJ;
fvec4 l_ij2 = l_ij*l_ij;
fvec4 u_ij2 = u_ij*u_ij;
fvec4 rInverse = 1.0f/r;
fvec4 r2Inverse = rInverse*rInverse;
fvec4 logRatio = fastLog(u_ij/l_ij);
fvec4 t3 = 0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*(l_ij2 - u_ij2) + 0.25f*logRatio*r2Inverse;
fvec4 de = bornForce*t3*rInverse;
de = blend(0.0f, de, include);
fvec4 fx = dx*de;
fvec4 fy = dy*de;
fvec4 fz = dz*de;
blockAtomForceX += fx;
blockAtomForceY += fy;
blockAtomForceZ += fz;
float* atomForce = forces+4*atomJ;
fvec4 one(1.0f);
atomForce[0] -= dot4(fx, one);
atomForce[1] -= dot4(fy, one);
atomForce[2] -= dot4(fz, one);
}
fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
transpose(f[0], f[1], f[2], f[3]);
for (int i = 0; i < numInBlock; i++) {
int atomIndex = blockStart+i;
(fvec4(forces+4*atomIndex)+f[i]).store(forces+4*atomIndex);
}
}
threadEnergy[threadIndex] = energy;
}
void CpuGBSAOBCForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
dx = x-posI[0];
dy = y-posI[1];
dz = z-posI[2];
if (periodic) {
dx -= round(dx*invBoxSize[0])*boxSize[0];
dy -= round(dy*invBoxSize[1])*boxSize[1];
dz -= round(dz*invBoxSize[2])*boxSize[2];
}
r2 = dx*dx + dy*dy + dz*dz;
}
fvec4 CpuGBSAOBCForce::fastLog(const fvec4& x) {
// Evaluate log(x) using a lookup table for speed.
if (any((x < TABLE_MIN) | (x >= TABLE_MAX)))
return fvec4(logf(x[0]), logf(x[1]), logf(x[2]), logf(x[3]));
fvec4 x1 = (x-TABLE_MIN)*logDXInv;
ivec4 index = floor(x1);
fvec4 coeff2 = x1-index;
fvec4 coeff1 = 1.0f-coeff2;
fvec4 t1(&logTable[index[0]]);
fvec4 t2(&logTable[index[1]]);
fvec4 t3(&logTable[index[2]]);
fvec4 t4(&logTable[index[3]]);
transpose(t1, t2, t3, t4);
return coeff1*t1 + coeff2*t2;
}
......@@ -38,8 +38,18 @@
using namespace OpenMM;
KernelImpl* CpuKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
ReferencePlatform::PlatformData& data = *static_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
CpuPlatform::PlatformData& data = CpuPlatform::getPlatformData(context);
if (name == CalcForcesAndEnergyKernel::Name())
return new CpuCalcForcesAndEnergyKernel(name, platform, data, context);
if (name == CalcPeriodicTorsionForceKernel::Name())
return new CpuCalcPeriodicTorsionForceKernel(name, platform, data);
if (name == CalcRBTorsionForceKernel::Name())
return new CpuCalcRBTorsionForceKernel(name, platform, data);
if (name == CalcNonbondedForceKernel::Name())
return new CpuCalcNonbondedForceKernel(name, platform);
return new CpuCalcNonbondedForceKernel(name, platform, data);
if (name == CalcGBSAOBCForceKernel::Name())
return new CpuCalcGBSAOBCForceKernel(name, platform, data);
if (name == IntegrateLangevinStepKernel::Name())
return new CpuIntegrateLangevinStepKernel(name, platform, data);
throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '") + name + "'").c_str());
}
......@@ -31,11 +31,17 @@
#include "CpuKernels.h"
#include "ReferenceBondForce.h"
#include "ReferenceConstraints.h"
#include "ReferenceKernelFactory.h"
#include "ReferenceKernels.h"
#include "ReferenceLJCoulomb14.h"
#include "ReferenceProperDihedralBond.h"
#include "ReferenceRbDihedralBond.h"
#include "openmm/Context.h"
#include "openmm/OpenMMException.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/internal/NonbondedForceImpl.h"
#include "openmm/internal/vectorize.h"
#include "RealVec.h"
using namespace OpenMM;
......@@ -61,6 +67,270 @@ static RealVec& extractBoxSize(ContextImpl& context) {
return *(RealVec*) data->periodicBoxSize;
}
static ReferenceConstraints& extractConstraints(ContextImpl& context) {
ReferencePlatform::PlatformData* data = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
return *(ReferenceConstraints*) data->constraints;
}
/**
* Compute the kinetic energy of the system, possibly shifting the velocities in time to account
* for a leapfrog integrator.
*/
static double computeShiftedKineticEnergy(ContextImpl& context, vector<double>& masses, double timeShift) {
vector<RealVec>& posData = extractPositions(context);
vector<RealVec>& velData = extractVelocities(context);
vector<RealVec>& forceData = extractForces(context);
int numParticles = context.getSystem().getNumParticles();
// Compute the shifted velocities.
vector<RealVec> shiftedVel(numParticles);
for (int i = 0; i < numParticles; ++i) {
if (masses[i] > 0)
shiftedVel[i] = velData[i]+forceData[i]*(timeShift/masses[i]);
else
shiftedVel[i] = velData[i];
}
// Apply constraints to them.
vector<double> inverseMasses(numParticles);
for (int i = 0; i < numParticles; i++)
inverseMasses[i] = (masses[i] == 0 ? 0 : 1/masses[i]);
extractConstraints(context).applyToVelocities(posData, shiftedVel, inverseMasses, 1e-4);
// Compute the kinetic energy.
double energy = 0.0;
for (int i = 0; i < numParticles; ++i)
if (masses[i] > 0)
energy += masses[i]*(shiftedVel[i].dot(shiftedVel[i]));
return 0.5*energy;
}
class CpuCalcForcesAndEnergyKernel::SumForceTask : public ThreadPool::Task {
public:
SumForceTask(int numParticles, vector<RealVec>& forceData, CpuPlatform::PlatformData& data) : numParticles(numParticles), forceData(forceData), data(data) {
}
void execute(ThreadPool& threads, int threadIndex) {
// Sum the contributions to forces that have been calculated by different threads.
int numThreads = threads.getNumThreads();
int start = threadIndex*numParticles/numThreads;
int end = (threadIndex+1)*numParticles/numThreads;
for (int i = start; i < end; i++) {
fvec4 f(0.0f);
for (int j = 0; j < numThreads; j++)
f += fvec4(&data.threadForce[j][4*i]);
forceData[i][0] += f[0];
forceData[i][1] += f[1];
forceData[i][2] += f[2];
}
}
int numParticles;
vector<RealVec>& forceData;
CpuPlatform::PlatformData& data;
};
class CpuCalcForcesAndEnergyKernel::InitForceTask : public ThreadPool::Task {
public:
InitForceTask(int numParticles, ContextImpl& context, CpuPlatform::PlatformData& data) : numParticles(numParticles), context(context), data(data) {
}
void execute(ThreadPool& threads, int threadIndex) {
// Convert the positions to single precision and apply periodic boundary conditions
AlignedArray<float>& posq = data.posq;
vector<RealVec>& posData = extractPositions(context);
RealVec boxSize = extractBoxSize(context);
double invBoxSize[3] = {1/boxSize[0], 1/boxSize[1], 1/boxSize[2]};
int numParticles = context.getSystem().getNumParticles();
int numThreads = threads.getNumThreads();
int start = threadIndex*numParticles/numThreads;
int end = (threadIndex+1)*numParticles/numThreads;
if (data.isPeriodic)
for (int i = start; i < end; i++)
for (int j = 0; j < 3; j++) {
RealOpenMM x = posData[i][j];
double base = floor(x*invBoxSize[j])*boxSize[j];
posq[4*i+j] = (float) (x-base);
}
else
for (int i = start; i < end; i++) {
posq[4*i] = (float) posData[i][0];
posq[4*i+1] = (float) posData[i][1];
posq[4*i+2] = (float) posData[i][2];
}
// Clear the forces.
fvec4 zero(0.0f);
for (int j = 0; j < numParticles; j++)
zero.store(&data.threadForce[threadIndex][j*4]);
}
int numParticles;
ContextImpl& context;
CpuPlatform::PlatformData& data;
};
CpuCalcForcesAndEnergyKernel::CpuCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data, ContextImpl& context) :
CalcForcesAndEnergyKernel(name, platform), data(data) {
// Create a Reference platform version of this kernel.
ReferenceKernelFactory referenceFactory;
referenceKernel = Kernel(referenceFactory.createKernelImpl(name, platform, context));
}
void CpuCalcForcesAndEnergyKernel::initialize(const System& system) {
referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().initialize(system);
}
void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().beginComputation(context, includeForce, includeEnergy, groups);
// Convert positions to single precision and clear the forces.
InitForceTask task(context.getSystem().getNumParticles(), context, data);
data.threads.execute(task);
data.threads.waitForThreads();
}
double CpuCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
// Sum the forces from all the threads.
SumForceTask task(context.getSystem().getNumParticles(), extractForces(context), data);
data.threads.execute(task);
data.threads.waitForThreads();
return referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().finishComputation(context, includeForce, includeEnergy, groups);
}
CpuCalcPeriodicTorsionForceKernel::~CpuCalcPeriodicTorsionForceKernel() {
if (torsionIndexArray != NULL) {
for (int i = 0; i < numTorsions; i++) {
delete[] torsionIndexArray[i];
delete[] torsionParamArray[i];
}
delete[] torsionIndexArray;
delete[] torsionParamArray;
}
}
void CpuCalcPeriodicTorsionForceKernel::initialize(const System& system, const PeriodicTorsionForce& force) {
numTorsions = force.getNumTorsions();
torsionIndexArray = new int*[numTorsions];
for (int i = 0; i < numTorsions; i++)
torsionIndexArray[i] = new int[4];
torsionParamArray = new RealOpenMM*[numTorsions];
for (int i = 0; i < numTorsions; i++)
torsionParamArray[i] = new RealOpenMM[3];
for (int i = 0; i < numTorsions; ++i) {
int particle1, particle2, particle3, particle4, periodicity;
double phase, k;
force.getTorsionParameters(i, particle1, particle2, particle3, particle4, periodicity, phase, k);
torsionIndexArray[i][0] = particle1;
torsionIndexArray[i][1] = particle2;
torsionIndexArray[i][2] = particle3;
torsionIndexArray[i][3] = particle4;
torsionParamArray[i][0] = (RealOpenMM) k;
torsionParamArray[i][1] = (RealOpenMM) phase;
torsionParamArray[i][2] = (RealOpenMM) periodicity;
}
bondForce.initialize(system.getNumParticles(), numTorsions, 4, torsionIndexArray, data.threads);
}
double CpuCalcPeriodicTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
vector<RealVec>& posData = extractPositions(context);
vector<RealVec>& forceData = extractForces(context);
RealOpenMM energy = 0;
ReferenceProperDihedralBond periodicTorsionBond;
bondForce.calculateForce(posData, torsionParamArray, forceData, includeEnergy ? &energy : NULL, periodicTorsionBond);
return energy;
}
void CpuCalcPeriodicTorsionForceKernel::copyParametersToContext(ContextImpl& context, const PeriodicTorsionForce& force) {
if (numTorsions != force.getNumTorsions())
throw OpenMMException("updateParametersInContext: The number of torsions has changed");
// Record the values.
for (int i = 0; i < numTorsions; ++i) {
int particle1, particle2, particle3, particle4, periodicity;
double phase, k;
force.getTorsionParameters(i, particle1, particle2, particle3, particle4, periodicity, phase, k);
if (particle1 != torsionIndexArray[i][0] || particle2 != torsionIndexArray[i][1] || particle3 != torsionIndexArray[i][2] || particle4 != torsionIndexArray[i][3])
throw OpenMMException("updateParametersInContext: The set of particles in a torsion has changed");
torsionParamArray[i][0] = (RealOpenMM) k;
torsionParamArray[i][1] = (RealOpenMM) phase;
torsionParamArray[i][2] = (RealOpenMM) periodicity;
}
}
CpuCalcRBTorsionForceKernel::~CpuCalcRBTorsionForceKernel() {
if (torsionIndexArray != NULL) {
for (int i = 0; i < numTorsions; i++) {
delete[] torsionIndexArray[i];
delete[] torsionParamArray[i];
}
delete[] torsionIndexArray;
delete[] torsionParamArray;
}
}
void CpuCalcRBTorsionForceKernel::initialize(const System& system, const RBTorsionForce& force) {
numTorsions = force.getNumTorsions();
torsionIndexArray = new int*[numTorsions];
for (int i = 0; i < numTorsions; i++)
torsionIndexArray[i] = new int[4];
torsionParamArray = new RealOpenMM*[numTorsions];
for (int i = 0; i < numTorsions; i++)
torsionParamArray[i] = new RealOpenMM[6];
for (int i = 0; i < numTorsions; ++i) {
int particle1, particle2, particle3, particle4;
double c0, c1, c2, c3, c4, c5;
force.getTorsionParameters(i, particle1, particle2, particle3, particle4, c0, c1, c2, c3, c4, c5);
torsionIndexArray[i][0] = particle1;
torsionIndexArray[i][1] = particle2;
torsionIndexArray[i][2] = particle3;
torsionIndexArray[i][3] = particle4;
torsionParamArray[i][0] = (RealOpenMM) c0;
torsionParamArray[i][1] = (RealOpenMM) c1;
torsionParamArray[i][2] = (RealOpenMM) c2;
torsionParamArray[i][3] = (RealOpenMM) c3;
torsionParamArray[i][4] = (RealOpenMM) c4;
torsionParamArray[i][5] = (RealOpenMM) c5;
}
bondForce.initialize(system.getNumParticles(), numTorsions, 4, torsionIndexArray, data.threads);
}
double CpuCalcRBTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
vector<RealVec>& posData = extractPositions(context);
vector<RealVec>& forceData = extractForces(context);
RealOpenMM energy = 0;
ReferenceRbDihedralBond rbTorsionBond;
bondForce.calculateForce(posData, torsionParamArray, forceData, includeEnergy ? &energy : NULL, rbTorsionBond);
return energy;
}
void CpuCalcRBTorsionForceKernel::copyParametersToContext(ContextImpl& context, const RBTorsionForce& force) {
if (numTorsions != force.getNumTorsions())
throw OpenMMException("updateParametersInContext: The number of torsions has changed");
// Record the values.
for (int i = 0; i < numTorsions; ++i) {
int particle1, particle2, particle3, particle4;
double c0, c1, c2, c3, c4, c5;
force.getTorsionParameters(i, particle1, particle2, particle3, particle4, c0, c1, c2, c3, c4, c5);
if (particle1 != torsionIndexArray[i][0] || particle2 != torsionIndexArray[i][1] || particle3 != torsionIndexArray[i][2] || particle4 != torsionIndexArray[i][3])
throw OpenMMException("updateParametersInContext: The set of particles in a torsion has changed");
torsionParamArray[i][0] = (RealOpenMM) c0;
torsionParamArray[i][1] = (RealOpenMM) c1;
torsionParamArray[i][2] = (RealOpenMM) c2;
torsionParamArray[i][3] = (RealOpenMM) c3;
torsionParamArray[i][4] = (RealOpenMM) c4;
torsionParamArray[i][5] = (RealOpenMM) c5;
}
}
class CpuCalcNonbondedForceKernel::PmeIO : public CalcPmeReciprocalForceKernel::IO {
public:
PmeIO(float* posq, float* force, int numParticles) : posq(posq), force(force), numParticles(numParticles) {
......@@ -81,15 +351,35 @@ private:
int numParticles;
};
bool isVec8Supported();
CpuNonbondedForce* createCpuNonbondedForceVec4();
CpuNonbondedForce* createCpuNonbondedForceVec8();
CpuCalcNonbondedForceKernel::CpuCalcNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) : CalcNonbondedForceKernel(name, platform),
data(data), bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false), neighborList(NULL), nonbonded(NULL) {
if (isVec8Supported()) {
neighborList = new CpuNeighborList(8);
nonbonded = createCpuNonbondedForceVec8();
}
else {
neighborList = new CpuNeighborList(4);
nonbonded = createCpuNonbondedForceVec4();
}
}
CpuCalcNonbondedForceKernel::~CpuCalcNonbondedForceKernel() {
if (bonded14ParamArray != NULL) {
for (int i = 0; i < num14; i++) {
delete[] bonded14IndexArray[i];
delete[] bonded14ParamArray[i];
}
delete bonded14IndexArray;
delete bonded14ParamArray;
delete[] bonded14IndexArray;
delete[] bonded14ParamArray;
}
if (nonbonded != NULL)
delete nonbonded;
if (neighborList != NULL)
delete neighborList;
}
void CpuCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
......@@ -97,8 +387,6 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
// Identify which exceptions are 1-4 interactions.
numParticles = force.getNumParticles();
posq.resize(4*numParticles, 0);
forces.resize(4*numParticles, 0);
exclusions.resize(numParticles);
vector<int> nb14s;
for (int i = 0; i < force.getNumExceptions(); i++) {
......@@ -125,7 +413,7 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
for (int i = 0; i < numParticles; ++i) {
double charge, radius, depth;
force.getParticleParameters(i, charge, radius, depth);
posq[4*i+3] = (float) charge;
data.posq[4*i+3] = (float) charge;
particleParams[i] = make_pair((float) (0.5*radius), (float) (2.0*sqrt(depth)));
sumSquaredCharges += charge*charge;
}
......@@ -173,6 +461,7 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
else
dispersionCoefficient = 0.0;
lastPositions.resize(numParticles, Vec3(1e10, 1e10, 1e10));
data.isPeriodic = (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME);
}
double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
......@@ -192,32 +481,14 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
}
}
}
AlignedArray<float>& posq = data.posq;
vector<RealVec>& posData = extractPositions(context);
vector<RealVec>& forceData = extractForces(context);
RealVec boxSize = extractBoxSize(context);
float floatBoxSize[3] = {(float) boxSize[0], (float) boxSize[1], (float) boxSize[2]};
double energy = ewaldSelfEnergy;
bool periodic = (nonbondedMethod == CutoffPeriodic);
double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
bool ewald = (nonbondedMethod == Ewald);
bool pme = (nonbondedMethod == PME);
// Convert the positions to single precision.
if (periodic)
for (int i = 0; i < numParticles; i++)
for (int j = 0; j < 3; j++) {
RealOpenMM x = posData[i][j];
double base = floor(x/boxSize[j]+0.5)*boxSize[j];
posq[4*i+j] = (float) (x-base);
}
else
for (int i = 0; i < numParticles; i++) {
posq[4*i] = (float) posData[i][0];
posq[4*i+1] = (float) posData[i][1];
posq[4*i+2] = (float) posData[i][2];
}
for (int i = 0; i < 4*numParticles; i++)
forces[i] = 0.0f;
if (nonbondedMethod != NoCutoff) {
// Determine whether we need to recompute the neighbor list.
......@@ -244,6 +515,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
int numMoved = moved.size();
double cutoff2 = nonbondedCutoff*nonbondedCutoff;
double paddedCutoff2 = (nonbondedCutoff+padding)*(nonbondedCutoff+padding);
for (int i = 1; i < numMoved && !needRecompute; i++)
for (int j = 0; j < i; j++) {
RealVec delta = posData[moved[i]]-posData[moved[j]];
......@@ -251,7 +523,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
// These particles should interact. See if they are in the neighbor list.
RealVec oldDelta = lastPositions[moved[i]]-lastPositions[moved[j]];
if (oldDelta.dot(oldDelta) > cutoff2) {
if (oldDelta.dot(oldDelta) > paddedCutoff2) {
needRecompute = true;
break;
}
......@@ -259,47 +531,42 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
}
}
if (needRecompute) {
neighborList.computeNeighborList(numParticles, posq, exclusions, floatBoxSize, periodic || ewald || pme, nonbondedCutoff+padding);
neighborList->computeNeighborList(numParticles, posq, exclusions, floatBoxSize, data.isPeriodic, nonbondedCutoff+padding, data.threads);
lastPositions = posData;
}
nonbonded.setUseCutoff(nonbondedCutoff, neighborList, rfDielectric);
nonbonded->setUseCutoff(nonbondedCutoff, *neighborList, rfDielectric);
}
if (periodic || ewald || pme) {
if (data.isPeriodic) {
double minAllowedSize = 1.999999*nonbondedCutoff;
if (boxSize[0] < minAllowedSize || boxSize[1] < minAllowedSize || boxSize[2] < minAllowedSize)
throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
nonbonded.setPeriodic(floatBoxSize);
nonbonded->setPeriodic(floatBoxSize);
}
if (ewald)
nonbonded.setUseEwald(ewaldAlpha, kmax[0], kmax[1], kmax[2]);
nonbonded->setUseEwald(ewaldAlpha, kmax[0], kmax[1], kmax[2]);
if (pme)
nonbonded.setUsePME(ewaldAlpha, gridSize);
nonbonded->setUsePME(ewaldAlpha, gridSize);
if (useSwitchingFunction)
nonbonded.setUseSwitchingFunction(switchingDistance);
float nonbondedEnergy = 0;
nonbonded->setUseSwitchingFunction(switchingDistance);
double nonbondedEnergy = 0;
if (includeDirect)
nonbonded.calculateDirectIxn(numParticles, &posq[0], particleParams, exclusions, &forces[0], includeEnergy ? &nonbondedEnergy : NULL);
nonbonded->calculateDirectIxn(numParticles, &posq[0], posData, particleParams, exclusions, data.threadForce, includeEnergy ? &nonbondedEnergy : NULL, data.threads);
if (includeReciprocal) {
if (useOptimizedPme) {
PmeIO io(&posq[0], &forces[0], numParticles);
PmeIO io(&posq[0], &data.threadForce[0][0], numParticles);
Vec3 periodicBoxSize(boxSize[0], boxSize[1], boxSize[2]);
optimizedPme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, periodicBoxSize, includeEnergy);
optimizedPme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
nonbondedEnergy += optimizedPme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
}
else
nonbonded.calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);
nonbonded->calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);
}
energy += nonbondedEnergy;
for (int i = 0; i < numParticles; i++) {
forceData[i][0] += forces[4*i];
forceData[i][1] += forces[4*i+1];
forceData[i][2] += forces[4*i+2];
}
if (includeDirect) {
ReferenceBondForce refBondForce;
ReferenceLJCoulomb14 nonbonded14;
refBondForce.calculateForce(num14, bonded14IndexArray, posData, bonded14ParamArray, forceData, includeEnergy ? &energy : NULL, nonbonded14);
if (periodic || ewald || pme)
if (data.isPeriodic)
energy += dispersionCoefficient/(boxSize[0]*boxSize[1]*boxSize[2]);
}
return energy;
......@@ -325,7 +592,7 @@ void CpuCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
for (int i = 0; i < numParticles; ++i) {
double charge, radius, depth;
force.getParticleParameters(i, charge, radius, depth);
posq[4*i+3] = (float) charge;
data.posq[4*i+3] = (float) charge;
particleParams[i] = make_pair((float) (0.5*radius), (float) (2.0*sqrt(depth)));
sumSquaredCharges += charge*charge;
}
......@@ -350,3 +617,94 @@ void CpuCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
if (force.getUseDispersionCorrection() && (method == NonbondedForce::CutoffPeriodic || method == NonbondedForce::Ewald || method == NonbondedForce::PME))
dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(context.getSystem(), force);
}
CpuCalcGBSAOBCForceKernel::~CpuCalcGBSAOBCForceKernel() {
}
void CpuCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCForce& force) {
int numParticles = system.getNumParticles();
particleParams.resize(numParticles);
for (int i = 0; i < numParticles; ++i) {
double charge, radius, scalingFactor;
force.getParticleParameters(i, charge, radius, scalingFactor);
data.posq[4*i+3] = (float) charge;
radius -= 0.009;
particleParams[i] = make_pair((float) radius, (float) (scalingFactor*radius));
}
obc.setParticleParameters(particleParams);
obc.setSolventDielectric((float) force.getSolventDielectric());
obc.setSoluteDielectric((float) force.getSoluteDielectric());
if (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff)
obc.setUseCutoff((float) force.getCutoffDistance());
data.isPeriodic = (force.getNonbondedMethod() == GBSAOBCForce::CutoffPeriodic);
}
double CpuCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
if (data.isPeriodic) {
RealVec& boxSize = extractBoxSize(context);
float floatBoxSize[3] = {(float) boxSize[0], (float) boxSize[1], (float) boxSize[2]};
obc.setPeriodic(floatBoxSize);
}
double energy = 0.0;
obc.computeForce(data.posq, data.threadForce, includeEnergy ? &energy : NULL, data.threads);
return energy;
}
void CpuCalcGBSAOBCForceKernel::copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force) {
int numParticles = force.getNumParticles();
if (numParticles != obc.getParticleParameters().size())
throw OpenMMException("updateParametersInContext: The number of particles has changed");
// Record the values.
for (int i = 0; i < numParticles; ++i) {
double charge, radius, scalingFactor;
force.getParticleParameters(i, charge, radius, scalingFactor);
data.posq[4*i+3] = (float) charge;
radius -= 0.009;
particleParams[i] = make_pair((float) radius, (float) (scalingFactor*radius));
}
obc.setParticleParameters(particleParams);
}
CpuIntegrateLangevinStepKernel::~CpuIntegrateLangevinStepKernel() {
if (dynamics)
delete dynamics;
}
void CpuIntegrateLangevinStepKernel::initialize(const System& system, const LangevinIntegrator& integrator) {
int numParticles = system.getNumParticles();
masses.resize(numParticles);
for (int i = 0; i < numParticles; ++i)
masses[i] = static_cast<RealOpenMM>(system.getParticleMass(i));
data.random.initialize(integrator.getRandomNumberSeed(), data.threads.getNumThreads());
}
void CpuIntegrateLangevinStepKernel::execute(ContextImpl& context, const LangevinIntegrator& integrator) {
double temperature = integrator.getTemperature();
double friction = integrator.getFriction();
double stepSize = integrator.getStepSize();
vector<RealVec>& posData = extractPositions(context);
vector<RealVec>& velData = extractVelocities(context);
vector<RealVec>& forceData = extractForces(context);
if (dynamics == 0 || temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
// Recreate the computation objects with the new parameters.
if (dynamics)
delete dynamics;
RealOpenMM tau = (friction == 0.0 ? 0.0 : 1.0/friction);
dynamics = new CpuLangevinDynamics(context.getSystem().getNumParticles(), stepSize, tau, temperature, data.threads, data.random);
dynamics->setReferenceConstraintAlgorithm(&extractConstraints(context));
prevTemp = temperature;
prevFriction = friction;
prevStepSize = stepSize;
}
dynamics->update(context.getSystem(), posData, velData, forceData, masses, integrator.getConstraintTolerance());
ReferencePlatform::PlatformData* refData = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
refData->time += stepSize;
refData->stepCount++;
}
double CpuIntegrateLangevinStepKernel::computeKineticEnergy(ContextImpl& context, const LangevinIntegrator& integrator) {
return computeShiftedKineticEnergy(context, masses, 0.5*integrator.getStepSize());
}
/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
* Authors: Peter Eastman
* Contributors:
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject
* to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "SimTKOpenMMCommon.h"
#include "SimTKOpenMMLog.h"
#include "SimTKOpenMMUtilities.h"
#include "CpuLangevinDynamics.h"
using namespace OpenMM;
using namespace std;
class CpuLangevinDynamics::Update1Task : public ThreadPool::Task {
public:
Update1Task(CpuLangevinDynamics& owner) : owner(owner) {
}
void execute(ThreadPool& threads, int threadIndex) {
owner.threadUpdate1(threadIndex);
}
CpuLangevinDynamics& owner;
};
class CpuLangevinDynamics::Update2Task : public ThreadPool::Task {
public:
Update2Task(CpuLangevinDynamics& owner) : owner(owner) {
}
void execute(ThreadPool& threads, int threadIndex) {
owner.threadUpdate2(threadIndex);
}
CpuLangevinDynamics& owner;
};
CpuLangevinDynamics::CpuLangevinDynamics(int numberOfAtoms, RealOpenMM deltaT, RealOpenMM tau, RealOpenMM temperature, ThreadPool& threads, CpuRandom& random) :
ReferenceStochasticDynamics(numberOfAtoms, deltaT, tau, temperature), threads(threads), random(random) {
}
CpuLangevinDynamics::~CpuLangevinDynamics() {
}
void CpuLangevinDynamics::updatePart1(int numberOfAtoms, vector<RealVec>& atomCoordinates, vector<RealVec>& velocities,
vector<RealVec>& forces, vector<RealOpenMM>& inverseMasses, vector<RealVec>& xPrime) {
// Record the parameters for the threads.
this->numberOfAtoms = numberOfAtoms;
this->atomCoordinates = &atomCoordinates[0];
this->velocities = &velocities[0];
this->forces = &forces[0];
this->inverseMasses = &inverseMasses[0];
this->xPrime = &xPrime[0];
// Signal the threads to start running and wait for them to finish.
Update1Task task(*this);
threads.execute(task);
threads.waitForThreads();
}
void CpuLangevinDynamics::updatePart2(int numberOfAtoms, vector<RealVec>& atomCoordinates, vector<RealVec>& velocities,
vector<RealVec>& forces, vector<RealOpenMM>& inverseMasses, vector<RealVec>& xPrime) {
// Record the parameters for the threads.
this->numberOfAtoms = numberOfAtoms;
this->atomCoordinates = &atomCoordinates[0];
this->velocities = &velocities[0];
this->forces = &forces[0];
this->inverseMasses = &inverseMasses[0];
this->xPrime = &xPrime[0];
// Signal the threads to start running and wait for them to finish.
Update2Task task(*this);
threads.execute(task);
threads.waitForThreads();
}
void CpuLangevinDynamics::threadUpdate1(int threadIndex) {
const RealOpenMM tau = getTau();
const RealOpenMM vscale = EXP(-getDeltaT()/tau);
const RealOpenMM fscale = (1-vscale)*tau;
const RealOpenMM kT = BOLTZ*getTemperature();
const RealOpenMM noisescale = SQRT(2*kT/tau)*SQRT(0.5*(1-vscale*vscale)*tau);
int start = threadIndex*numberOfAtoms/threads.getNumThreads();
int end = (threadIndex+1)*numberOfAtoms/threads.getNumThreads();
for (int i = start; i < end; i++) {
if (inverseMasses[i] != 0.0) {
RealOpenMM sqrtInvMass = SQRT(inverseMasses[i]);
RealVec noise(random.getGaussianRandom(threadIndex), random.getGaussianRandom(threadIndex), random.getGaussianRandom(threadIndex));
velocities[i] = velocities[i]*vscale + forces[i]*(fscale*inverseMasses[i]) + noise*(noisescale*sqrtInvMass);
}
}
}
void CpuLangevinDynamics::threadUpdate2(int threadIndex) {
const RealOpenMM dt = getDeltaT();
int start = threadIndex*numberOfAtoms/threads.getNumThreads();
int end = (threadIndex+1)*numberOfAtoms/threads.getNumThreads();
for (int i = start; i < end; i++) {
if (inverseMasses[i] != 0.0) {
RealOpenMM sqrtInvMass = SQRT(inverseMasses[i]);
xPrime[i] = atomCoordinates[i]+velocities[i]*dt;
}
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CpuNeighborList.h"
#include "openmm/internal/hardware.h"
#include "openmm/internal/vectorize.h"
......@@ -12,27 +43,25 @@ using namespace std;
namespace OpenMM {
const int CpuNeighborList::BlockSize = 4;
class VoxelIndex
{
public:
VoxelIndex() : x(0), y(0) {
}
VoxelIndex(int x, int y) : x(x), y(y) {
}
int x;
int y;
};
typedef pair<const float*, int> VoxelItem;
/**
* This data structure organizes the particles spatially. It divides them into bins along the x and y axes,
* then sorts each bin along the z axis so ranges can be identified quickly with a binary search.
*/
class CpuNeighborList::Voxels {
public:
Voxels(float vsx, float vsy, float minx, float maxx, float miny, float maxy, const float* periodicBoxSize, bool usePeriodic) :
voxelSizeX(vsx), voxelSizeY(vsy), minx(minx), maxx(maxx), miny(miny), maxy(maxy), periodicBoxSize(periodicBoxSize), usePeriodic(usePeriodic) {
Voxels(int blockSize, float vsx, float vsy, float minx, float maxx, float miny, float maxy, const float* periodicBoxSize, bool usePeriodic) :
blockSize(blockSize), voxelSizeX(vsx), voxelSizeY(vsy), minx(minx), maxx(maxx), miny(miny), maxy(maxy), periodicBoxSize(periodicBoxSize), usePeriodic(usePeriodic) {
if (usePeriodic) {
nx = (int) floorf(periodicBoxSize[0]/voxelSizeX+0.5f);
ny = (int) floorf(periodicBoxSize[1]/voxelSizeY+0.5f);
......@@ -60,7 +89,7 @@ public:
*/
void insert(const int& atom, const float* location) {
VoxelIndex voxelIndex = getVoxelIndex(location);
bins[voxelIndex.x][voxelIndex.y].push_back(make_pair(location[2], VoxelItem(location, atom)));
bins[voxelIndex.x][voxelIndex.y].push_back(make_pair(location[2], atom));
}
/**
......@@ -76,7 +105,7 @@ public:
* Find the index of the first particle in voxel (x,y) whose z coordinate in >= the specified value.
*/
int findLowerBound(int x, int y, double z) const {
const vector<pair<float, VoxelItem> >& bin = bins[x][y];
const vector<pair<float, int> >& bin = bins[x][y];
int lower = 0;
int upper = bin.size();
while (lower < upper) {
......@@ -93,7 +122,7 @@ public:
* Find the index of the first particle in voxel (x,y) whose z coordinate in greater than the specified value.
*/
int findUpperBound(int x, int y, double z) const {
const vector<pair<float, VoxelItem> >& bin = bins[x][y];
const vector<pair<float, int> >& bin = bins[x][y];
int lower = 0;
int upper = bin.size();
while (lower < upper) {
......@@ -125,7 +154,7 @@ public:
return VoxelIndex(x, y);
}
void getNeighbors(vector<int>& neighbors, int blockIndex, fvec4 blockCenter, fvec4 blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int> blockAtoms, const float* atomLocations) const {
void getNeighbors(vector<int>& neighbors, int blockIndex, const fvec4& blockCenter, const fvec4& blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int>& blockAtoms, const float* atomLocations, const vector<VoxelIndex>& atomVoxelIndex) const {
neighbors.resize(0);
exclusions.resize(0);
fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
......@@ -135,8 +164,12 @@ public:
float refineCutoff = maxDistance-max(max(blockWidth[0], blockWidth[1]), blockWidth[2]);
float refineCutoffSquared = refineCutoff*refineCutoff;
int dIndexX = min(nx/2, int((maxDistance+blockWidth[0])/voxelSizeX)+1); // How may voxels away do we have to look?
int dIndexY = min(ny/2, int((maxDistance+blockWidth[1])/voxelSizeY)+1);
int dIndexX = int((maxDistance+blockWidth[0])/voxelSizeX)+1; // How may voxels away do we have to look?
int dIndexY = int((maxDistance+blockWidth[1])/voxelSizeY)+1;
if (usePeriodic) {
dIndexX = min(nx/2, dIndexX);
dIndexY = min(ny/2, dIndexY);
}
float centerPos[4];
blockCenter.store(centerPos);
VoxelIndex centerVoxelIndex = getVoxelIndex(centerPos);
......@@ -148,63 +181,87 @@ public:
if (usePeriodic) {
endx = min(endx, centerVoxelIndex.x-dIndexX+nx-1);
endy = min(endy, centerVoxelIndex.y-dIndexY+ny-1);
numRanges = 2;
}
else {
startx = max(startx, 0);
starty = max(starty, 0);
endx = min(endx, nx-1);
endy = min(endy, ny-1);
numRanges = 1;
}
int lastSortedIndex = BlockSize*(blockIndex+1);
int lastSortedIndex = blockSize*(blockIndex+1);
VoxelIndex voxelIndex(0, 0);
for (int x = startx; x <= endx; ++x) {
voxelIndex.x = x;
if (usePeriodic)
voxelIndex.x = (x < 0 ? x+nx : (x >= nx ? x-nx : x));
float dx = max(0.0f, voxelSizeX*max(0, abs(centerVoxelIndex.x-x)-1)-blockWidth[0]);
for (int y = starty; y <= endy; ++y) {
voxelIndex.y = y;
if (usePeriodic)
voxelIndex.y = (y < 0 ? y+ny : (y >= ny ? y-ny : y));
float dy = max(0.0f, voxelSizeY*max(0, abs(centerVoxelIndex.y-y)-1)-blockWidth[1]);
// Identify the range of atoms within this bin we need to search. When using periodic boundary
// conditions, there may be two separate ranges.
float dz = maxDistance+blockWidth[2];
dz = sqrtf(max(0.0f, dz*dz-dx*dx-dy*dy));
float minz = centerPos[2];
float maxz = centerPos[2];
fvec4 offset(voxelSizeX*x+(usePeriodic ? 0.0f : minx), voxelSizeY*y+(usePeriodic ? 0.0f : miny), 0, 0);
for (int k = 0; k < (int) blockAtoms.size(); k++) {
const float* atomPos = &atomLocations[4*blockAtoms[k]];
fvec4 posVec(atomPos);
fvec4 delta1 = offset-posVec;
fvec4 delta2 = delta1+fvec4(voxelSizeX, voxelSizeY, 0, 0);
if (usePeriodic) {
delta1 -= round(delta1*invBoxSize)*boxSize;
delta2 -= round(delta2*invBoxSize)*boxSize;
}
fvec4 delta = min(abs(delta1), abs(delta2));
float dx = (x == atomVoxelIndex[k].x ? 0.0f : delta[0]);
float dy = (y == atomVoxelIndex[k].y ? 0.0f : delta[1]);
float dist2 = maxDistanceSquared-dx*dx-dy*dy;
if (dist2 > 0) {
float dist = sqrtf(dist2);
minz = min(minz, atomPos[2]-dist);
maxz = max(maxz, atomPos[2]+dist);
}
}
if (minz == maxz)
continue;
bool needPeriodic = (centerPos[0]-blockWidth[0] < maxDistance || centerPos[0]+blockWidth[0] > periodicBoxSize[0]-maxDistance ||
centerPos[1]-blockWidth[1] < maxDistance || centerPos[1]+blockWidth[1] > periodicBoxSize[1]-maxDistance ||
minz < 0.0f || maxz > periodicBoxSize[2]);
int rangeStart[2];
int rangeEnd[2];
rangeStart[0] = findLowerBound(voxelIndex.x, voxelIndex.y, centerPos[2]-dz);
if (usePeriodic) {
rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, centerPos[2]+dz);
rangeStart[0] = findLowerBound(voxelIndex.x, voxelIndex.y, minz);
if (needPeriodic) {
numRanges = 2;
rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, maxz);
if (rangeStart[0] > 0) {
rangeStart[1] = 0;
rangeEnd[1] = min(findUpperBound(voxelIndex.x, voxelIndex.y, centerPos[2]+dz-periodicBoxSize[2]), rangeStart[0]);
rangeEnd[1] = min(findUpperBound(voxelIndex.x, voxelIndex.y, maxz-periodicBoxSize[2]), rangeStart[0]);
}
else {
rangeStart[1] = max(findLowerBound(voxelIndex.x, voxelIndex.y, centerPos[2]-dz+periodicBoxSize[2]), rangeEnd[0]);
rangeStart[1] = max(findLowerBound(voxelIndex.x, voxelIndex.y, minz+periodicBoxSize[2]), rangeEnd[0]);
rangeEnd[1] = bins[voxelIndex.x][voxelIndex.y].size();
}
}
else
rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, centerPos[2]+dz);
else {
numRanges = 1;
rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, maxz);
}
// Loop over atoms and check to see if they are neighbors of this block.
for (int range = 0; range < numRanges; range++) {
for (int item = rangeStart[range]; item < rangeEnd[range]; item++) {
const int sortedIndex = bins[voxelIndex.x][voxelIndex.y][item].second.second;
const int sortedIndex = bins[voxelIndex.x][voxelIndex.y][item].second;
// Avoid duplicate entries.
if (sortedIndex >= lastSortedIndex)
continue;
fvec4 atomPos(bins[voxelIndex.x][voxelIndex.y][item].second.first);
fvec4 atomPos(atomLocations+4*sortedAtoms[sortedIndex]);
fvec4 delta = atomPos-centerPos;
if (usePeriodic) {
if (needPeriodic) {
fvec4 base = round(delta*invBoxSize)*boxSize;
delta = delta-base;
}
......@@ -221,7 +278,7 @@ public:
for (int k = 0; k < (int) blockAtoms.size(); k++) {
fvec4 pos1(&atomLocations[4*blockAtoms[k]]);
delta = atomPos-pos1;
if (usePeriodic) {
if (needPeriodic) {
fvec4 base = round(delta*invBoxSize)*boxSize;
delta = delta-base;
}
......@@ -238,10 +295,12 @@ public:
// Add this atom to the list of neighbors.
neighbors.push_back(sortedAtoms[sortedIndex]);
if (sortedIndex < BlockSize*blockIndex)
if (sortedIndex < blockSize*blockIndex)
exclusions.push_back(0);
else
exclusions.push_back(0xF & (0xF<<(sortedIndex-BlockSize*blockIndex)));
else {
int mask = (1<<blockSize)-1;
exclusions.push_back(mask & (mask<<(sortedIndex-blockSize*blockIndex)));
}
}
}
}
......@@ -249,63 +308,31 @@ public:
}
private:
int blockSize;
float voxelSizeX, voxelSizeY;
float minx, maxx, miny, maxy;
int nx, ny;
const float* periodicBoxSize;
const bool usePeriodic;
vector<vector<vector<pair<float, VoxelItem> > > > bins;
vector<vector<vector<pair<float, int> > > > bins;
};
class CpuNeighborList::ThreadData {
class CpuNeighborList::ThreadTask : public ThreadPool::Task {
public:
ThreadData(int index, CpuNeighborList& owner) : index(index), owner(owner) {
ThreadTask(CpuNeighborList& owner) : owner(owner) {
}
void execute(ThreadPool& threads, int threadIndex) {
owner.threadComputeNeighborList(threads, threadIndex);
}
int index;
CpuNeighborList& owner;
};
static void* threadBody(void* args) {
CpuNeighborList::ThreadData& data = *reinterpret_cast<CpuNeighborList::ThreadData*>(args);
data.owner.runThread(data.index);
delete &data;
return 0;
CpuNeighborList::CpuNeighborList(int blockSize) : blockSize(blockSize) {
}
CpuNeighborList::CpuNeighborList() {
isDeleted = false;
numThreads = getNumProcessors();
pthread_cond_init(&startCondition, NULL);
pthread_cond_init(&endCondition, NULL);
pthread_mutex_init(&lock, NULL);
thread.resize(numThreads);
pthread_mutex_lock(&lock);
waitCount = 0;
for (int i = 0; i < numThreads; i++) {
ThreadData* data = new ThreadData(i, *this);
threadData.push_back(data);
pthread_create(&thread[i], NULL, threadBody, data);
}
while (waitCount < numThreads)
pthread_cond_wait(&endCondition, &lock);
pthread_mutex_unlock(&lock);
}
CpuNeighborList::~CpuNeighborList() {
isDeleted = true;
pthread_mutex_lock(&lock);
pthread_cond_broadcast(&startCondition);
pthread_mutex_unlock(&lock);
for (int i = 0; i < (int) thread.size(); i++)
pthread_join(thread[i], NULL);
pthread_mutex_destroy(&lock);
pthread_cond_destroy(&startCondition);
pthread_cond_destroy(&endCondition);
}
void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& atomLocations, const vector<set<int> >& exclusions,
const float* periodicBoxSize, bool usePeriodic, float maxDistance) {
int numBlocks = (numAtoms+BlockSize-1)/BlockSize;
void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const vector<set<int> >& exclusions,
const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads) {
int numBlocks = (numAtoms+blockSize-1)/blockSize;
blockNeighbors.resize(numBlocks);
blockExclusions.resize(numBlocks);
sortedAtoms.resize(numAtoms);
......@@ -338,8 +365,9 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
// Sort the atoms based on a Hilbert curve.
atomBins.resize(numAtoms);
pthread_mutex_lock(&lock);
advanceThreads();
ThreadTask task(*this);
threads.execute(task);
threads.waitForThreads();
sort(atomBins.begin(), atomBins.end());
// Build the voxel hash.
......@@ -348,10 +376,10 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
if (!usePeriodic)
edgeSizeX = edgeSizeY = maxDistance; // TODO - adjust this as needed
else {
edgeSizeX = 0.5f*periodicBoxSize[0]/floorf(periodicBoxSize[0]/maxDistance);
edgeSizeY = 0.5f*periodicBoxSize[1]/floorf(periodicBoxSize[1]/maxDistance);
edgeSizeX = 0.6f*periodicBoxSize[0]/floorf(periodicBoxSize[0]/maxDistance);
edgeSizeY = 0.6f*periodicBoxSize[1]/floorf(periodicBoxSize[1]/maxDistance);
}
Voxels voxels(edgeSizeX, edgeSizeY, minx, maxx, miny, maxy, periodicBoxSize, usePeriodic);
Voxels voxels(blockSize, edgeSizeX, edgeSizeY, minx, maxx, miny, maxy, periodicBoxSize, usePeriodic);
for (int i = 0; i < numAtoms; i++) {
int atomIndex = atomBins[i].second;
sortedAtoms[i] = atomIndex;
......@@ -362,14 +390,14 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
// Signal the threads to start running and wait for them to finish.
advanceThreads();
pthread_mutex_unlock(&lock);
threads.resumeThreads();
threads.waitForThreads();
// Add padding atoms to fill up the last block.
int numPadding = numBlocks*BlockSize-numAtoms;
int numPadding = numBlocks*blockSize-numAtoms;
if (numPadding > 0) {
char mask = (0xF0 >> numPadding) & 0xF;
char mask = ((0xFFFF-(1<<blockSize)+1) >> numPadding);
for (int i = 0; i < numPadding; i++)
sortedAtoms.push_back(0);
vector<char>& exc = blockExclusions[blockExclusions.size()-1];
......@@ -379,7 +407,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
}
int CpuNeighborList::getNumBlocks() const {
return sortedAtoms.size()/BlockSize;
return sortedAtoms.size()/blockSize;
}
const std::vector<int>& CpuNeighborList::getSortedAtoms() const {
......@@ -395,82 +423,59 @@ const std::vector<char>& CpuNeighborList::getBlockExclusions(int blockIndex) con
}
void CpuNeighborList::runThread(int index) {
while (true) {
// Wait for the signal to start running.
threadWait();
if (isDeleted)
break;
// Compute the positions of atoms along the Hilbert curve.
void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadIndex) {
// Compute the positions of atoms along the Hilbert curve.
float binWidth = max(max(maxx-minx, maxy-miny), maxz-minz)/255.0f;
float invBinWidth = 1.0f/binWidth;
bitmask_t coords[3];
for (int i = index; i < numAtoms; i += numThreads) {
const float* pos = &atomLocations[4*i];
coords[0] = (bitmask_t) ((pos[0]-minx)*invBinWidth);
coords[1] = (bitmask_t) ((pos[1]-miny)*invBinWidth);
coords[2] = (bitmask_t) ((pos[2]-minz)*invBinWidth);
int bin = (int) hilbert_c2i(3, 8, coords);
atomBins[i] = pair<int, int>(bin, i);
}
threadWait();
// Compute this thread's subset of neighbors.
int numBlocks = blockNeighbors.size();
vector<int> blockAtoms;
for (int i = index; i < numBlocks; i += numThreads) {
{
int firstIndex = BlockSize*i;
int atomsInBlock = min(BlockSize, numAtoms-firstIndex);
blockAtoms.resize(atomsInBlock);
for (int j = 0; j < atomsInBlock; j++)
blockAtoms[j] = sortedAtoms[firstIndex+j];
}
float binWidth = max(max(maxx-minx, maxy-miny), maxz-minz)/255.0f;
float invBinWidth = 1.0f/binWidth;
bitmask_t coords[3];
int numThreads = threads.getNumThreads();
for (int i = threadIndex; i < numAtoms; i += numThreads) {
const float* pos = &atomLocations[4*i];
coords[0] = (bitmask_t) ((pos[0]-minx)*invBinWidth);
coords[1] = (bitmask_t) ((pos[1]-miny)*invBinWidth);
coords[2] = (bitmask_t) ((pos[2]-minz)*invBinWidth);
int bin = (int) hilbert_c2i(3, 8, coords);
atomBins[i] = pair<int, int>(bin, i);
}
threads.syncThreads();
int firstIndex = BlockSize*i;
fvec4 minPos(&atomLocations[4*sortedAtoms[firstIndex]]);
fvec4 maxPos = minPos;
int atomsInBlock = min(BlockSize, numAtoms-firstIndex);
for (int j = 1; j < atomsInBlock; j++) {
fvec4 pos(&atomLocations[4*sortedAtoms[firstIndex+j]]);
minPos = min(minPos, pos);
maxPos = max(maxPos, pos);
}
voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, atomLocations);
// Record the exclusions for this block.
for (int j = 0; j < atomsInBlock; j++) {
const set<int>& atomExclusions = (*exclusions)[sortedAtoms[firstIndex+j]];
char mask = 1<<j;
for (int k = 0; k < (int) blockNeighbors[i].size(); k++) {
int atomIndex = blockNeighbors[i][k];
if (atomExclusions.find(atomIndex) != atomExclusions.end())
blockExclusions[i][k] |= mask;
}
}
// Compute this thread's subset of neighbors.
int numBlocks = blockNeighbors.size();
vector<int> blockAtoms;
vector<VoxelIndex> atomVoxelIndex;
for (int i = threadIndex; i < numBlocks; i += numThreads) {
// Find the atoms in this block and compute their bounding box.
int firstIndex = blockSize*i;
int atomsInBlock = min(blockSize, numAtoms-firstIndex);
blockAtoms.resize(atomsInBlock);
atomVoxelIndex.resize(atomsInBlock);
for (int j = 0; j < atomsInBlock; j++) {
blockAtoms[j] = sortedAtoms[firstIndex+j];
atomVoxelIndex[j] = voxels->getVoxelIndex(&atomLocations[4*blockAtoms[j]]);
}
}
}
fvec4 minPos(&atomLocations[4*sortedAtoms[firstIndex]]);
fvec4 maxPos = minPos;
for (int j = 1; j < atomsInBlock; j++) {
fvec4 pos(&atomLocations[4*sortedAtoms[firstIndex+j]]);
minPos = min(minPos, pos);
maxPos = max(maxPos, pos);
}
voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, atomLocations, atomVoxelIndex);
void CpuNeighborList::threadWait() {
pthread_mutex_lock(&lock);
waitCount++;
pthread_cond_signal(&endCondition);
pthread_cond_wait(&startCondition, &lock);
pthread_mutex_unlock(&lock);
}
// Record the exclusions for this block.
void CpuNeighborList::advanceThreads() {
waitCount = 0;
pthread_cond_broadcast(&startCondition);
while (waitCount < numThreads) {
pthread_cond_wait(&endCondition, &lock);
for (int j = 0; j < atomsInBlock; j++) {
const set<int>& atomExclusions = (*exclusions)[sortedAtoms[firstIndex+j]];
char mask = 1<<j;
for (int k = 0; k < (int) blockNeighbors[i].size(); k++) {
int atomIndex = blockNeighbors[i][k];
if (atomExclusions.find(atomIndex) != atomExclusions.end())
blockExclusions[i][k] |= mask;
}
}
}
}
......
......@@ -22,7 +22,6 @@
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <string.h>
#include <complex>
#include "SimTKOpenMMCommon.h"
......@@ -30,9 +29,8 @@
#include "CpuNonbondedForce.h"
#include "ReferenceForce.h"
#include "ReferencePME.h"
#include "openmm/internal/hardware.h"
#include "openmm/internal/SplineFitter.h"
#include "openmm/internal/vectorize.h"
#include "gmx_atomic.h"
#include <algorithm>
// In case we're using some primitive version of Visual Studio this will
// make sure that erf() and erfc() are defined.
......@@ -42,80 +40,43 @@ using namespace std;
using namespace OpenMM;
const float CpuNonbondedForce::TWO_OVER_SQRT_PI = (float) (2/sqrt(PI_M));
const int CpuNonbondedForce::NUM_TABLE_POINTS = 1025;
const int CpuNonbondedForce::NUM_TABLE_POINTS = 2048;
class CpuNonbondedForce::ThreadData {
class CpuNonbondedForce::ComputeDirectTask : public ThreadPool::Task {
public:
ThreadData(int index, CpuNonbondedForce& owner) : index(index), owner(owner) {
ComputeDirectTask(CpuNonbondedForce& owner) : owner(owner) {
}
void execute(ThreadPool& threads, int threadIndex) {
owner.threadComputeDirect(threads, threadIndex);
}
int index;
CpuNonbondedForce& owner;
vector<float> threadForce;
double threadEnergy;
};
static void* threadBody(void* args) {
CpuNonbondedForce::ThreadData& data = *reinterpret_cast<CpuNonbondedForce::ThreadData*>(args);
data.owner.runThread(data.index, data.threadForce, data.threadEnergy);
delete &data;
return 0;
}
/**---------------------------------------------------------------------------------------
CpuNonbondedForce constructor
--------------------------------------------------------------------------------------- */
CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false) {
isDeleted = false;
numThreads = getNumProcessors();
pthread_cond_init(&startCondition, NULL);
pthread_cond_init(&endCondition, NULL);
pthread_mutex_init(&lock, NULL);
thread.resize(numThreads);
pthread_mutex_lock(&lock);
waitCount = 0;
for (int i = 0; i < numThreads; i++) {
ThreadData* data = new ThreadData(i, *this);
threadData.push_back(data);
pthread_create(&thread[i], NULL, threadBody, data);
}
while (waitCount < numThreads)
pthread_cond_wait(&endCondition, &lock);
pthread_mutex_unlock(&lock);
CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), tableIsValid(false), cutoffDistance(0.0f), alphaEwald(0.0f) {
}
/**---------------------------------------------------------------------------------------
CpuNonbondedForce destructor
--------------------------------------------------------------------------------------- */
CpuNonbondedForce::~CpuNonbondedForce(){
isDeleted = true;
pthread_mutex_lock(&lock);
pthread_cond_broadcast(&startCondition);
pthread_mutex_unlock(&lock);
for (int i = 0; i < (int) thread.size(); i++)
pthread_join(thread[i], NULL);
pthread_mutex_destroy(&lock);
pthread_cond_destroy(&startCondition);
pthread_cond_destroy(&endCondition);
CpuNonbondedForce::~CpuNonbondedForce() {
}
/**---------------------------------------------------------------------------------------
/**---------------------------------------------------------------------------------------
Set the force to use a cutoff.
Set the force to use a cutoff.
@param distance the cutoff distance
@param neighbors the neighbor list to use
@param solventDielectric the dielectric constant of the bulk solvent
@param distance the cutoff distance
@param neighbors the neighbor list to use
@param solventDielectric the dielectric constant of the bulk solvent
--------------------------------------------------------------------------------------- */
void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric) {
void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric) {
if (distance != cutoffDistance)
tableIsValid = false;
cutoff = true;
cutoffDistance = distance;
neighborList = &neighbors;
......@@ -170,6 +131,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
--------------------------------------------------------------------------------------- */
void CpuNonbondedForce::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
if (alpha != alphaEwald)
tableIsValid = false;
alphaEwald = alpha;
numRx = kmaxx;
numRy = kmaxy;
......@@ -188,6 +151,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
--------------------------------------------------------------------------------------- */
void CpuNonbondedForce::setUsePME(float alpha, int meshSize[3]) {
if (alpha != alphaEwald)
tableIsValid = false;
alphaEwald = alpha;
meshDim[0] = meshSize[0];
meshDim[1] = meshSize[1];
......@@ -198,35 +163,27 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
void CpuNonbondedForce::tabulateEwaldScaleFactor() {
ewaldDX = cutoffDistance/(NUM_TABLE_POINTS-2);
if (tableIsValid)
return;
tableIsValid = true;
ewaldDX = cutoffDistance/NUM_TABLE_POINTS;
ewaldDXInv = 1.0f/ewaldDX;
vector<double> x(NUM_TABLE_POINTS+1);
vector<double> y(NUM_TABLE_POINTS+1);
vector<double> deriv;
for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
double r = i*cutoffDistance/(NUM_TABLE_POINTS-2);
ewaldScaleTable.resize(NUM_TABLE_POINTS+4);
for (int i = 0; i < NUM_TABLE_POINTS+4; i++) {
double r = i*ewaldDX;
double alphaR = alphaEwald*r;
x[i] = r;
y[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
}
SplineFitter::createNaturalSpline(x, y, deriv);
ewaldScaleTable.resize(4*NUM_TABLE_POINTS);
for (int i = 0; i < NUM_TABLE_POINTS; i++) {
ewaldScaleTable[4*i] = (float) y[i];
ewaldScaleTable[4*i+1] = (float) y[i+1];
ewaldScaleTable[4*i+2] = (float) (deriv[i]*ewaldDX*ewaldDX/6);
ewaldScaleTable[4*i+3] = (float) (deriv[i+1]*ewaldDX*ewaldDX/6);
ewaldScaleTable[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
}
}
void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, vector<RealVec>& atomCoordinates,
void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates,
const vector<pair<float, float> >& atomParameters, const vector<set<int> >& exclusions,
vector<RealVec>& forces, float* totalEnergy) const {
vector<RealVec>& forces, double* totalEnergy) const {
typedef std::complex<float> d_complex;
static const float epsilon = 1.0;
int kmax = (ewald ? std::max(numRx, std::max(numRy,numRz)) : 0);
int kmax = (ewald ? max(numRx, max(numRy,numRz)) : 0);
float factorEwald = -1 / (4*alphaEwald*alphaEwald);
float TWO_PI = 2.0 * PI_M;
float recipCoeff = (float)(ONE_4PI_EPS0*4*PI_M/(periodicBoxSize[0] * periodicBoxSize[1] * periodicBoxSize[2]) /epsilon);
......@@ -333,111 +290,107 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, v
}
void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<pair<float, float> >& atomParameters,
const vector<set<int> >& exclusions, float* forces, float* totalEnergy) {
void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates, const vector<pair<float, float> >& atomParameters,
const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
// Record the parameters for the threads.
this->numberOfAtoms = numberOfAtoms;
this->posq = posq;
this->atomCoordinates = &atomCoordinates[0];
this->atomParameters = &atomParameters[0];
this->exclusions = &exclusions[0];
this->threadForce = &threadForce;
includeEnergy = (totalEnergy != NULL);
threadEnergy.resize(threads.getNumThreads());
gmx_atomic_t counter;
gmx_atomic_set(&counter, 0);
this->atomicCounter = &counter;
// Signal the threads to start running and wait for them to finish.
pthread_mutex_lock(&lock);
waitCount = 0;
pthread_cond_broadcast(&startCondition);
while (waitCount < numThreads)
pthread_cond_wait(&endCondition, &lock);
pthread_mutex_unlock(&lock);
ComputeDirectTask task(*this);
threads.execute(task);
threads.waitForThreads();
// Combine the results from all the threads.
// Combine the energies from all the threads.
double directEnergy = 0;
for (int i = 0; i < numThreads; i++)
directEnergy += threadData[i]->threadEnergy;
for (int i = 0; i < numberOfAtoms; i++) {
fvec4 f(forces+4*i);
for (int j = 0; j < numThreads; j++)
f += fvec4(&threadData[j]->threadForce[4*i]);
f.store(forces+4*i);
if (totalEnergy != NULL) {
double directEnergy = 0;
int numThreads = threads.getNumThreads();
for (int i = 0; i < numThreads; i++)
directEnergy += threadEnergy[i];
*totalEnergy += directEnergy;
}
if (totalEnergy != NULL)
*totalEnergy += (float) directEnergy;
}
void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex) {
// Compute this thread's subset of interactions.
int numThreads = threads.getNumThreads();
threadEnergy[threadIndex] = 0;
double* energyPtr = (includeEnergy ? &threadEnergy[threadIndex] : NULL);
float* forces = &(*threadForce)[threadIndex][0];
fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
if (ewald || pme) {
// Compute the interactions from the neighbor list.
while (true) {
int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
if (nextBlock >= neighborList->getNumBlocks())
break;
calculateBlockEwaldIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
}
// Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
void CpuNonbondedForce::runThread(int index, vector<float>& threadForce, double& threadEnergy) {
while (true) {
// Wait for the signal to start running.
pthread_mutex_lock(&lock);
waitCount++;
pthread_cond_signal(&endCondition);
pthread_cond_wait(&startCondition, &lock);
pthread_mutex_unlock(&lock);
if (isDeleted)
break;
// Compute this thread's subset of interactions.
threadEnergy = 0;
double* energyPtr = (includeEnergy ? &threadEnergy : NULL);
threadForce.resize(4*numberOfAtoms, 0.0f);
for (int i = 0; i < 4*numberOfAtoms; i++)
threadForce[i] = 0.0f;
fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
if (ewald || pme) {
// Compute the interactions from the neighbor list.
for (int i = index; i < neighborList->getNumBlocks(); i += numThreads)
calculateBlockEwaldIxn(i, &threadForce[0], energyPtr, boxSize, invBoxSize);
// Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
for (int i = index; i < numberOfAtoms; i += numThreads)
for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
if (*iter > i) {
int j = *iter;
fvec4 deltaR;
fvec4 posI(posq+4*i);
fvec4 posJ(posq+4*j);
float r2;
getDeltaR(posJ, posI, deltaR, r2, false, boxSize, invBoxSize);
float r = sqrtf(r2);
float inverseR = 1/r;
float chargeProd = ONE_4PI_EPS0*posq[4*i+3]*posq[4*j+3];
float alphaR = alphaEwald*r;
float erfcAlphaR = erfcApprox(alphaR)[0];
float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR);
dEdR = (float) (dEdR * (1.0f-erfcAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR)));
fvec4 result = deltaR*dEdR;
(fvec4(&threadForce[4*i])-result).store(&threadForce[4*i]);
(fvec4(&threadForce[4*j])+result).store(&threadForce[4*j]);
if (includeEnergy)
threadEnergy -= chargeProd*inverseR*(1.0f-erfcAlphaR);
}
for (int i = threadIndex; i < numberOfAtoms; i += numThreads) {
fvec4 posI((float) atomCoordinates[i][0], (float) atomCoordinates[i][1], (float) atomCoordinates[i][2], 0.0f);
for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
if (*iter > i) {
int j = *iter;
fvec4 deltaR;
fvec4 posJ((float) atomCoordinates[j][0], (float) atomCoordinates[j][1], (float) atomCoordinates[j][2], 0.0f);
float r2;
getDeltaR(posJ, posI, deltaR, r2, false, boxSize, invBoxSize);
float r = sqrtf(r2);
float inverseR = 1/r;
float chargeProd = ONE_4PI_EPS0*posq[4*i+3]*posq[4*j+3];
float alphaR = alphaEwald*r;
float erfcAlphaR = erfcApprox(alphaR);
float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR);
dEdR = (float) (dEdR * (1.0f-erfcAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR)));
fvec4 result = deltaR*dEdR;
(fvec4(forces+4*i)-result).store(forces+4*i);
(fvec4(forces+4*j)+result).store(forces+4*j);
if (includeEnergy)
threadEnergy[threadIndex] -= chargeProd*inverseR*(1.0f-erfcAlphaR);
}
}
}
else if (cutoff) {
// Compute the interactions from the neighbor list.
for (int i = index; i < neighborList->getNumBlocks(); i += numThreads)
calculateBlockIxn(i, &threadForce[0], energyPtr, boxSize, invBoxSize);
}
else if (cutoff) {
// Compute the interactions from the neighbor list.
while (true) {
int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
if (nextBlock >= neighborList->getNumBlocks())
break;
calculateBlockIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
}
else {
// Loop over all atom pairs
for (int i = index; i < numberOfAtoms; i += numThreads){
for (int j = i+1; j < numberOfAtoms; j++)
if (exclusions[j].find(i) == exclusions[j].end())
calculateOneIxn(i, j, &threadForce[0], energyPtr, boxSize, invBoxSize);
}
}
else {
// Loop over all atom pairs
while (true) {
int i = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
if (i >= numberOfAtoms)
break;
for (int j = i+1; j < numberOfAtoms; j++)
if (exclusions[j].find(i) == exclusions[j].end())
calculateOneIxn(i, j, forces, energyPtr, boxSize, invBoxSize);
}
}
}
......@@ -496,198 +449,6 @@ void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* forces, double* t
(fvec4(forces+4*jj)-result).store(forces+4*jj);
}
void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// Load the positions and parameters of the atoms in the block.
int blockAtom[4];
fvec4 blockAtomPosq[4];
fvec4 blockAtomForce[4];
for (int i = 0; i < 4; i++) {
blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
blockAtomForce[i] = fvec4(0.0f);
}
fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
float blockAtomR2[4];
bool include[4];
fvec4 blockAtomDelta[4];
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
int atom = neighbors[i];
fvec4 atomPosq(posq+4*atom);
// Compute the distances to the block atoms.
bool any = false;
for (int j = 0; j < 4; j++) {
getDeltaR(atomPosq, blockAtomPosq[j], blockAtomDelta[j], blockAtomR2[j], periodic, boxSize, invBoxSize);
include[j] = (((exclusions[i]>>j)&1) == 0 && (!cutoff || blockAtomR2[j] < cutoffDistance*cutoffDistance));
any |= include[j];
}
if (!any)
continue; // No interactions to compute.
// Compute the interactions.
fvec4 r2(blockAtomR2);
fvec4 r = sqrt(r2);
fvec4 inverseR = fvec4(1.0f)/r;
fvec4 switchValue(1.0f), switchDeriv(0.0f);
if (useSwitch) {
fvec4 t = (r>switchingDistance) & ((r-switchingDistance)/(cutoffDistance-switchingDistance));
switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))/(cutoffDistance-switchingDistance);
}
fvec4 sig = blockAtomSigma+atomParameters[atom].first;
fvec4 sig2 = inverseR*sig;
sig2 *= sig2;
fvec4 sig6 = sig2*sig2*sig2;
fvec4 eps = blockAtomEpsilon*atomParameters[atom].second;
fvec4 dEdR = switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
if (cutoff)
dEdR += chargeProd*(inverseR-2.0f*krf*r2);
else
dEdR += chargeProd*inverseR;
dEdR *= inverseR*inverseR;
fvec4 energy = eps*(sig6-1.0f)*sig6;
if (useSwitch) {
dEdR -= energy*switchDeriv*inverseR;
energy *= switchValue;
}
// Accumulate energies.
if (totalEnergy) {
if (cutoff)
energy += chargeProd*(inverseR+krf*r2-crf);
else
energy += chargeProd*inverseR;
for (int j = 0; j < 4; j++)
if (include[j])
*totalEnergy += energy[j];
}
// Accumulate forces.
fvec4 atomForce(forces+4*atom);
for (int j = 0; j < 4; j++) {
if (include[j]) {
fvec4 result = blockAtomDelta[j]*dEdR[j];
blockAtomForce[j] += result;
atomForce -= result;
}
}
atomForce.store(forces+4*atom);
}
// Record the forces on the block atoms.
for (int j = 0; j < 4; j++)
(fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]);
}
void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// Load the positions and parameters of the atoms in the block.
int blockAtom[4];
fvec4 blockAtomPosq[4];
fvec4 blockAtomForce[4];
for (int i = 0; i < 4; i++) {
blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
blockAtomForce[i] = fvec4(0.0f);
}
fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
float blockAtomR2[4];
bool include[4];
fvec4 blockAtomDelta[4];
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
int atom = neighbors[i];
fvec4 atomPosq(posq+4*atom);
// Compute the distances to the block atoms.
bool any = false;
for (int j = 0; j < 4; j++) {
getDeltaR(atomPosq, blockAtomPosq[j], blockAtomDelta[j], blockAtomR2[j], periodic, boxSize, invBoxSize);
include[j] = (((exclusions[i]>>j)&1) == 0 && blockAtomR2[j] < cutoffDistance*cutoffDistance);
any |= include[j];
}
if (!any)
continue; // No interactions to compute.
// Compute the interactions.
fvec4 r2(blockAtomR2);
fvec4 r = sqrt(r2);
fvec4 inverseR = fvec4(1.0f)/r;
fvec4 switchValue(1.0f), switchDeriv(0.0f);
if (useSwitch) {
fvec4 t = (r>switchingDistance) & ((r-switchingDistance)/(cutoffDistance-switchingDistance));
switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))/(cutoffDistance-switchingDistance);
}
fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
fvec4 dEdR = chargeProd*inverseR*ewaldScaleFunction(r);
fvec4 sig = blockAtomSigma+atomParameters[atom].first;
fvec4 sig2 = inverseR*sig;
sig2 *= sig2;
fvec4 sig6 = sig2*sig2*sig2;
fvec4 eps = blockAtomEpsilon*atomParameters[atom].second;
dEdR += switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
dEdR *= inverseR*inverseR;
fvec4 energy = eps*(sig6-1.0f)*sig6;
if (useSwitch) {
dEdR -= energy*switchDeriv*inverseR;
energy *= switchValue;
}
// Accumulate energies.
if (totalEnergy) {
energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
for (int j = 0; j < 4; j++)
if (include[j])
*totalEnergy += energy[j];
}
// Accumulate forces.
fvec4 atomForce(forces+4*atom);
for (int j = 0; j < 4; j++) {
if (include[j]) {
fvec4 result = blockAtomDelta[j]*dEdR[j];
blockAtomForce[j] += result;
atomForce -= result;
}
}
atomForce.store(forces+4*atom);
}
// Record the forces on the block atoms.
for (int j = 0; j < 4; j++)
(fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]);
}
void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
deltaR = posJ-posI;
if (periodic) {
......@@ -697,33 +458,15 @@ void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& d
r2 = dot3(deltaR, deltaR);
}
fvec4 CpuNonbondedForce::erfcApprox(fvec4 x) {
float CpuNonbondedForce::erfcApprox(float x) {
// This approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as
// the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum
// error of 3e-7.
fvec4 t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
float t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
t *= t;
t *= t;
t *= t;
return 1.0f/(t*t);
}
fvec4 CpuNonbondedForce::ewaldScaleFunction(fvec4 x) {
// Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
float y[4];
fvec4 x1 = x*ewaldDXInv;
ivec4 index = floor(x1);
fvec4 coeff[4];
coeff[1] = x1-index;
coeff[0] = 1.0f-coeff[1];
coeff[2] = coeff[0]*coeff[0]*coeff[0]-coeff[0];
coeff[3] = coeff[1]*coeff[1]*coeff[1]-coeff[1];
_MM_TRANSPOSE4_PS(coeff[0], coeff[1], coeff[2], coeff[3]);
for (int i = 0; i < 4; i++) {
if (index[i] < NUM_TABLE_POINTS)
y[i] = dot4(coeff[i], fvec4(&ewaldScaleTable[4*index[i]]));
}
return fvec4(y);
}
/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
* Contributors: Pande Group
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject
* to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "SimTKOpenMMCommon.h"
#include "SimTKOpenMMUtilities.h"
#include "CpuNonbondedForceVec4.h"
using namespace std;
using namespace OpenMM;
/**
* Factory method to create a CpuNonbondedForceVec4.
*/
CpuNonbondedForce* createCpuNonbondedForceVec4() {
return new CpuNonbondedForceVec4();
}
/**---------------------------------------------------------------------------------------
CpuNonbondedForceVec4 constructor
--------------------------------------------------------------------------------------- */
CpuNonbondedForceVec4::CpuNonbondedForceVec4() {
}
void CpuNonbondedForceVec4::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// Load the positions and parameters of the atoms in the block.
int blockAtom[4];
fvec4 blockAtomPosq[4];
fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
for (int i = 0; i < 4; i++) {
blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
}
fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
int atom = neighbors[i];
// Compute the distances to the block atoms.
fvec4 dx, dy, dz, r2;
getDeltaR(posq+4*atom, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
ivec4 include;
char excl = exclusions[i];
if (excl == 0)
include = -1;
else
include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
include = include & (r2 < cutoffDistance*cutoffDistance);
if (!any(include))
continue; // No interactions to compute.
// Compute the interactions.
fvec4 r = sqrt(r2);
fvec4 inverseR = fvec4(1.0f)/r;
fvec4 energy, dEdR;
float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) {
fvec4 sig = blockAtomSigma+atomParameters[atom].first;
fvec4 sig2 = inverseR*sig;
sig2 *= sig2;
fvec4 sig6 = sig2*sig2*sig2;
fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
dEdR = switchValue*dEdR - energy*switchDeriv*r;
energy *= switchValue;
}
}
else {
energy = 0.0f;
dEdR = 0.0f;
}
fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
if (cutoff)
dEdR += chargeProd*(inverseR-2.0f*krf*r2);
else
dEdR += chargeProd*inverseR;
dEdR *= inverseR*inverseR;
// Accumulate energies.
fvec4 one(1.0f);
if (totalEnergy) {
if (cutoff)
energy += chargeProd*(inverseR+krf*r2-crf);
else
energy += chargeProd*inverseR;
energy = blend(0.0f, energy, include);
*totalEnergy += dot4(energy, one);
}
// Accumulate forces.
dEdR = blend(0.0f, dEdR, include);
fvec4 fx = dx*dEdR;
fvec4 fy = dy*dEdR;
fvec4 fz = dz*dEdR;
blockAtomForceX += fx;
blockAtomForceY += fy;
blockAtomForceZ += fz;
float* atomForce = forces+4*atom;
atomForce[0] -= dot4(fx, one);
atomForce[1] -= dot4(fy, one);
atomForce[2] -= dot4(fz, one);
}
// Record the forces on the block atoms.
fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
transpose(f[0], f[1], f[2], f[3]);
for (int j = 0; j < 4; j++)
(fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
}
void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// Load the positions and parameters of the atoms in the block.
int blockAtom[4];
fvec4 blockAtomPosq[4];
fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
for (int i = 0; i < 4; i++) {
blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
}
fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
int atom = neighbors[i];
// Compute the distances to the block atoms.
fvec4 dx, dy, dz, r2;
getDeltaR(posq+4*atom, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
ivec4 include;
char excl = exclusions[i];
if (excl == 0)
include = -1;
else
include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
include = include & (r2 < cutoffDistance*cutoffDistance);
if (!any(include))
continue; // No interactions to compute.
// Compute the interactions.
fvec4 r = sqrt(r2);
fvec4 inverseR = fvec4(1.0f)/r;
fvec4 energy, dEdR;
float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) {
fvec4 sig = blockAtomSigma+atomParameters[atom].first;
fvec4 sig2 = inverseR*sig;
sig2 *= sig2;
fvec4 sig6 = sig2*sig2*sig2;
fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
dEdR = switchValue*dEdR - energy*switchDeriv*r;
energy *= switchValue;
}
}
else {
energy = 0.0f;
dEdR = 0.0f;
}
fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
dEdR += chargeProd*inverseR*ewaldScaleFunction(r);
dEdR *= inverseR*inverseR;
// Accumulate energies.
fvec4 one(1.0f);
if (totalEnergy) {
energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
energy = blend(0.0f, energy, include);
*totalEnergy += dot4(energy, one);
}
// Accumulate forces.
dEdR = blend(0.0f, dEdR, include);
fvec4 fx = dx*dEdR;
fvec4 fy = dy*dEdR;
fvec4 fz = dz*dEdR;
blockAtomForceX += fx;
blockAtomForceY += fy;
blockAtomForceZ += fz;
float* atomForce = forces+4*atom;
atomForce[0] -= dot4(fx, one);
atomForce[1] -= dot4(fy, one);
atomForce[2] -= dot4(fz, one);
}
// Record the forces on the block atoms.
fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
transpose(f[0], f[1], f[2], f[3]);
for (int j = 0; j < 4; j++)
(fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
}
void CpuNonbondedForceVec4::getDeltaR(const float* posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
dx = x-posI[0];
dy = y-posI[1];
dz = z-posI[2];
if (periodic) {
dx -= round(dx*invBoxSize[0])*boxSize[0];
dy -= round(dy*invBoxSize[1])*boxSize[1];
dz -= round(dz*invBoxSize[2])*boxSize[2];
}
r2 = dx*dx + dy*dy + dz*dz;
}
fvec4 CpuNonbondedForceVec4::erfcApprox(const fvec4& x) {
// This approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as
// the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum
// error of 3e-7.
fvec4 t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
t *= t;
t *= t;
t *= t;
return 1.0f/(t*t);
}
fvec4 CpuNonbondedForceVec4::ewaldScaleFunction(const fvec4& x) {
// Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
fvec4 x1 = x*ewaldDXInv;
ivec4 index = min(floor(x1), NUM_TABLE_POINTS);
fvec4 coeff2 = x1-index;
fvec4 coeff1 = 1.0f-coeff2;
fvec4 t1(&ewaldScaleTable[index[0]]);
fvec4 t2(&ewaldScaleTable[index[1]]);
fvec4 t3(&ewaldScaleTable[index[2]]);
fvec4 t4(&ewaldScaleTable[index[3]]);
transpose(t1, t2, t3, t4);
return coeff1*t1 + coeff2*t2;
}
/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
* Contributors: Pande Group
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject
* to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "SimTKOpenMMCommon.h"
#include "SimTKOpenMMUtilities.h"
#include "CpuNonbondedForceVec8.h"
#include "openmm/OpenMMException.h"
#include "openmm/internal/hardware.h"
using namespace std;
using namespace OpenMM;
#ifndef __AVX__
bool isVec8Supported() {
return false;
}
CpuNonbondedForce* createCpuNonbondedForceVec8() {
throw OpenMMException("Internal error: OpenMM was compiled without AVX support");
}
#else
/**
* Check whether 8 component vectors are supported with the current CPU.
*/
bool isVec8Supported() {
// Make sure the CPU supports AVX.
int cpuInfo[4];
cpuid(cpuInfo, 0);
if (cpuInfo[0] >= 1) {
cpuid(cpuInfo, 1);
return ((cpuInfo[2] & ((int) 1 << 28)) != 0);
}
return false;
}
/**
* Factory method to create a CpuNonbondedForceVec8.
*/
CpuNonbondedForce* createCpuNonbondedForceVec8() {
return new CpuNonbondedForceVec8();
}
/**---------------------------------------------------------------------------------------
CpuNonbondedForceVec8 constructor
--------------------------------------------------------------------------------------- */
CpuNonbondedForceVec8::CpuNonbondedForceVec8() {
}
void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// Load the positions and parameters of the atoms in the block.
int blockAtom[8];
fvec4 blockAtomPosq[8];
fvec8 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
fvec8 blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;
for (int i = 0; i < 8; i++) {
blockAtom[i] = neighborList->getSortedAtoms()[8*blockIndex+i];
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
}
transpose(blockAtomPosq[0], blockAtomPosq[1], blockAtomPosq[2], blockAtomPosq[3], blockAtomPosq[4], blockAtomPosq[5], blockAtomPosq[6], blockAtomPosq[7], blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge);
blockAtomCharge *= ONE_4PI_EPS0;
fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
int atom = neighbors[i];
// Compute the distances to the block atoms.
fvec8 dx, dy, dz, r2;
getDeltaR(&posq[4*atom], blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
ivec8 include;
char excl = exclusions[i];
if (excl == 0)
include = -1;
else
include = ivec8(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1, excl&16 ? 0 : -1, excl&32 ? 0 : -1, excl&64 ? 0 : -1, excl&128 ? 0 : -1);
include = include & (r2 < cutoffDistance*cutoffDistance);
if (!any(include))
continue; // No interactions to compute.
// Compute the interactions.
fvec8 r = sqrt(r2);
fvec8 inverseR = fvec8(1.0f)/r;
fvec8 energy, dEdR;
float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) {
fvec8 sig = blockAtomSigma+atomParameters[atom].first;
fvec8 sig2 = inverseR*sig;
sig2 *= sig2;
fvec8 sig6 = sig2*sig2*sig2;
fvec8 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
fvec8 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
fvec8 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
fvec8 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
dEdR = switchValue*dEdR - energy*switchDeriv*r;
energy *= switchValue;
}
}
else {
energy = 0.0f;
dEdR = 0.0f;
}
fvec8 chargeProd = blockAtomCharge*posq[4*atom+3];
if (cutoff)
dEdR += chargeProd*(inverseR-2.0f*krf*r2);
else
dEdR += chargeProd*inverseR;
dEdR *= inverseR*inverseR;
// Accumulate energies.
fvec8 one(1.0f);
if (totalEnergy) {
if (cutoff)
energy += chargeProd*(inverseR+krf*r2-crf);
else
energy += chargeProd*inverseR;
energy = blend(0.0f, energy, include);
*totalEnergy += dot8(energy, one);
}
// Accumulate forces.
dEdR = blend(0.0f, dEdR, include);
fvec8 fx = dx*dEdR;
fvec8 fy = dy*dEdR;
fvec8 fz = dz*dEdR;
blockAtomForceX += fx;
blockAtomForceY += fy;
blockAtomForceZ += fz;
float* atomForce = forces+4*atom;
atomForce[0] -= dot8(fx, one);
atomForce[1] -= dot8(fy, one);
atomForce[2] -= dot8(fz, one);
}
// Record the forces on the block atoms.
fvec4 f[8];
transpose(blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f, f[0], f[1], f[2], f[3], f[4], f[5], f[6], f[7]);
for (int j = 0; j < 8; j++)
(fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
}
void CpuNonbondedForceVec8::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// Load the positions and parameters of the atoms in the block.
int blockAtom[8];
fvec4 blockAtomPosq[8];
fvec8 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
fvec8 blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;
for (int i = 0; i < 8; i++) {
blockAtom[i] = neighborList->getSortedAtoms()[8*blockIndex+i];
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
}
transpose(blockAtomPosq[0], blockAtomPosq[1], blockAtomPosq[2], blockAtomPosq[3], blockAtomPosq[4], blockAtomPosq[5], blockAtomPosq[6], blockAtomPosq[7], blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge);
blockAtomCharge *= ONE_4PI_EPS0;
fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
int atom = neighbors[i];
// Compute the distances to the block atoms.
fvec8 dx, dy, dz, r2;
getDeltaR(&posq[4*atom], blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
ivec8 include;
char excl = exclusions[i];
if (excl == 0)
include = -1;
else
include = ivec8(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1, excl&16 ? 0 : -1, excl&32 ? 0 : -1, excl&64 ? 0 : -1, excl&128 ? 0 : -1);
include = include & (r2 < cutoffDistance*cutoffDistance);
if (!any(include))
continue; // No interactions to compute.
// Compute the interactions.
fvec8 r = sqrt(r2);
fvec8 inverseR = fvec8(1.0f)/r;
fvec8 energy, dEdR;
float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) {
fvec8 sig = blockAtomSigma+atomParameters[atom].first;
fvec8 sig2 = inverseR*sig;
sig2 *= sig2;
fvec8 sig6 = sig2*sig2*sig2;
fvec8 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
fvec8 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
fvec8 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
fvec8 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
dEdR = switchValue*dEdR - energy*switchDeriv*r;
energy *= switchValue;
}
}
else {
energy = 0.0f;
dEdR = 0.0f;
}
fvec8 chargeProd = blockAtomCharge*posq[4*atom+3];
dEdR += chargeProd*inverseR*ewaldScaleFunction(r);
dEdR *= inverseR*inverseR;
// Accumulate energies.
fvec8 one(1.0f);
if (totalEnergy) {
energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
energy = blend(0.0f, energy, include);
*totalEnergy += dot8(energy, one);
}
// Accumulate forces.
dEdR = blend(0.0f, dEdR, include);
fvec8 fx = dx*dEdR;
fvec8 fy = dy*dEdR;
fvec8 fz = dz*dEdR;
blockAtomForceX += fx;
blockAtomForceY += fy;
blockAtomForceZ += fz;
float* atomForce = forces+4*atom;
atomForce[0] -= dot8(fx, one);
atomForce[1] -= dot8(fy, one);
atomForce[2] -= dot8(fz, one);
}
// Record the forces on the block atoms.
fvec4 f[8];
transpose(blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f, f[0], f[1], f[2], f[3], f[4], f[5], f[6], f[7]);
for (int j = 0; j < 8; j++)
(fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
}
void CpuNonbondedForceVec8::getDeltaR(const float* posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
dx = x-posI[0];
dy = y-posI[1];
dz = z-posI[2];
if (periodic) {
dx -= round(dx*invBoxSize[0])*boxSize[0];
dy -= round(dy*invBoxSize[1])*boxSize[1];
dz -= round(dz*invBoxSize[2])*boxSize[2];
}
r2 = dx*dx + dy*dy + dz*dz;
}
fvec8 CpuNonbondedForceVec8::erfcApprox(const fvec8& x) {
// This approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as
// the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum
// error of 3e-7.
fvec8 t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
t *= t;
t *= t;
t *= t;
return 1.0f/(t*t);
}
fvec8 CpuNonbondedForceVec8::ewaldScaleFunction(const fvec8& x) {
// Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
fvec8 x1 = x*ewaldDXInv;
ivec8 index = min(floor(x1), NUM_TABLE_POINTS);
fvec8 coeff2 = x1-index;
fvec8 coeff1 = 1.0f-coeff2;
ivec4 indexLower = index.lowerVec();
ivec4 indexUpper = index.upperVec();
fvec4 t1(&ewaldScaleTable[indexLower[0]]);
fvec4 t2(&ewaldScaleTable[indexLower[1]]);
fvec4 t3(&ewaldScaleTable[indexLower[2]]);
fvec4 t4(&ewaldScaleTable[indexLower[3]]);
fvec4 t5(&ewaldScaleTable[indexUpper[0]]);
fvec4 t6(&ewaldScaleTable[indexUpper[1]]);
fvec4 t7(&ewaldScaleTable[indexUpper[2]]);
fvec4 t8(&ewaldScaleTable[indexUpper[3]]);
fvec8 s1, s2, s3, s4;
transpose(t1, t2, t3, t4, t5, t6, t7, t8, s1, s2, s3, s4);
return coeff1*s1 + coeff2*s2;
}
#endif
......@@ -32,9 +32,12 @@
#include "CpuPlatform.h"
#include "CpuKernelFactory.h"
#include "CpuKernels.h"
#include "CpuSETTLE.h"
#include "ReferenceConstraints.h"
#include "openmm/internal/hardware.h"
using namespace OpenMM;
using namespace std;
extern "C" OPENMM_EXPORT_CPU void registerPlatforms() {
// Only register this platform if the CPU supports SSE 4.1.
......@@ -43,9 +46,16 @@ extern "C" OPENMM_EXPORT_CPU void registerPlatforms() {
Platform::registerPlatform(new CpuPlatform());
}
map<ContextImpl*, CpuPlatform::PlatformData*> CpuPlatform::contextData;
CpuPlatform::CpuPlatform() {
CpuKernelFactory* factory = new CpuKernelFactory();
registerKernelFactory(CalcForcesAndEnergyKernel::Name(), factory);
registerKernelFactory(CalcPeriodicTorsionForceKernel::Name(), factory);
registerKernelFactory(CalcRBTorsionForceKernel::Name(), factory);
registerKernelFactory(CalcNonbondedForceKernel::Name(), factory);
registerKernelFactory(CalcGBSAOBCForceKernel::Name(), factory);
registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
}
double CpuPlatform::getSpeed() const {
......@@ -67,3 +77,33 @@ bool CpuPlatform::isProcessorSupported() {
}
return false;
}
void CpuPlatform::contextCreated(ContextImpl& context, const map<string, string>& properties) const {
ReferencePlatform::contextCreated(context, properties);
PlatformData* data = new PlatformData(context.getSystem().getNumParticles());
contextData[&context] = data;
ReferenceConstraints& constraints = *(ReferenceConstraints*) reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData())->constraints;
if (constraints.settle != NULL) {
CpuSETTLE* parallelSettle = new CpuSETTLE(context.getSystem(), *(ReferenceSETTLEAlgorithm*) constraints.settle, data->threads);
delete constraints.settle;
constraints.settle = parallelSettle;
}
}
void CpuPlatform::contextDestroyed(ContextImpl& context) const {
PlatformData* data = contextData[&context];
delete data;
contextData.erase(&context);
}
CpuPlatform::PlatformData& CpuPlatform::getPlatformData(ContextImpl& context) {
return *contextData[&context];
}
CpuPlatform::PlatformData::PlatformData(int numParticles) : posq(4*numParticles) {
int numThreads = threads.getNumThreads();
threadForce.resize(numThreads);
for (int i = 0; i < numThreads; i++)
threadForce[i].resize(4*numParticles);
isPeriodic = false;
}
/* Portions copyright (c) 2013 Stanford University and Simbios.
* Authors: Peter Eastman
* Contributors:
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject
* to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "CpuRandom.h"
#include "openmm/OpenMMException.h"
#include <cmath>
using namespace std;
using namespace OpenMM;
CpuRandom::CpuRandom() : hasInitialized(false) {
}
CpuRandom::~CpuRandom() {
for (int i = 0; i < threadRandom.size(); i++)
delete threadRandom[i];
}
void CpuRandom::initialize(int seed, int numThreads) {
if (hasInitialized) {
if (seed == randomSeed)
return; // Already initialized with the same seed.
throw OpenMMException("Random number generator initialized twice with different seeds");
}
randomSeed = seed;
hasInitialized = true;
threadRandom.resize(numThreads);
nextGaussian.resize(numThreads);
nextGaussianIsValid.resize(numThreads, false);
// Use a quick and dirty RNG to pick seeds for the real random number generator.
unsigned int r = (unsigned int) seed;
for (int i = 0; i < numThreads; i++) {
r = (1664525*r + 1013904223) & 0xFFFFFFFF;
threadRandom[i] = new OpenMM_SFMT::SFMT();
init_gen_rand(r, *threadRandom[i]);
}
}
float CpuRandom::getGaussianRandom(int threadIndex) {
if (nextGaussianIsValid[threadIndex]) {
nextGaussianIsValid[threadIndex] = false;
return nextGaussian[threadIndex];
}
// Use the polar form of the Box-Muller transformation to generate two Gaussian random numbers.
float x, y, r2;
do {
x = 2.0f*(float) genrand_real2(*threadRandom[threadIndex])-1.0f;
y = 2.0f*(float) genrand_real2(*threadRandom[threadIndex])-1.0f;
r2 = x*x + y*y;
} while (r2 >= 1.0f || r2 == 0.0f);
float multiplier = sqrtf((-2.0f*logf(r2))/r2);
nextGaussian[threadIndex] = y*multiplier;
nextGaussianIsValid[threadIndex] = true;
return x*multiplier;
}
float CpuRandom::getUniformRandom(int threadIndex) {
return genrand_real2(*threadRandom[threadIndex]);
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CpuSETTLE.h"
using namespace OpenMM;
using namespace std;
class CpuSETTLE::ApplyToPositionsTask : public ThreadPool::Task {
public:
ApplyToPositionsTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses,
RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), atomCoordinatesP(atomCoordinatesP),
inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
}
void execute(ThreadPool& threads, int threadIndex) {
threadSettle[threadIndex]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance);
}
vector<OpenMM::RealVec>& atomCoordinates;
vector<OpenMM::RealVec>& atomCoordinatesP;
vector<RealOpenMM>& inverseMasses;
RealOpenMM tolerance;
vector<ReferenceSETTLEAlgorithm*>& threadSettle;
};
class CpuSETTLE::ApplyToVelocitiesTask : public ThreadPool::Task {
public:
ApplyToVelocitiesTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses,
RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), velocities(velocities),
inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
}
void execute(ThreadPool& threads, int threadIndex) {
threadSettle[threadIndex]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
}
vector<OpenMM::RealVec>& atomCoordinates;
vector<OpenMM::RealVec>& velocities;
vector<RealOpenMM>& inverseMasses;
RealOpenMM tolerance;
vector<ReferenceSETTLEAlgorithm*>& threadSettle;
};
CpuSETTLE::CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settle, ThreadPool& threads) : threads(threads) {
int numThreads = threads.getNumThreads();
int numClusters = settle.getNumClusters();
vector<RealOpenMM> mass(system.getNumParticles());
for (int i = 0; i < system.getNumParticles(); i++)
mass[i] = system.getParticleMass(i);
for (int i = 0; i < numThreads; i++) {
int start = i*numClusters/numThreads;
int end = (i+1)*numClusters/numThreads;
if (start != end) {
int numThreadClusters = end-start;
vector<int> atom1(numThreadClusters), atom2(numThreadClusters), atom3(numThreadClusters);
vector<RealOpenMM> distance1(numThreadClusters), distance2(numThreadClusters);
for (int j = 0; j < numThreadClusters; j++)
settle.getClusterParameters(start+j, atom1[j], atom2[j], atom3[j], distance1[j], distance2[j]);
threadSettle.push_back(new ReferenceSETTLEAlgorithm(atom1, atom2, atom3, distance1, distance2, mass));
}
}
}
CpuSETTLE::~CpuSETTLE() {
for (int i = 0; i < (int) threadSettle.size(); i++)
delete threadSettle[i];
}
void CpuSETTLE::apply(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance) {
ApplyToPositionsTask task(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance, threadSettle);
threads.execute(task);
threads.waitForThreads();
}
void CpuSETTLE::applyToVelocities(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance) {
ApplyToVelocitiesTask task(atomCoordinates, velocities, inverseMasses, tolerance, threadSettle);
threads.execute(task);
threads.waitForThreads();
}
/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*-
*
* Copyright (c) 2004-2008, Erik Lindahl <lindahl@cbr.su.se>
*
* Unfortunately, some of the constructs in this file are _very_ sensitive
* to compiler optimizations and architecture changes. If you find any such
* errors, please send a message to lindahl@cbr.su.se to help us fix the
* upstream version too.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
* And Hey:
* Gnomes, ROck Monsters And Chili Sauce
*/
#ifndef _GMX_ATOMIC_H_
#define _GMX_ATOMIC_H_
/*! \file gmx_atomic.h
*
* @brief Atomic operations for fast SMP synchronization
*
* This file defines atomic integer operations and spinlocks for
* fast synchronization in performance-critical regions of gromacs.
*
* In general, the best option is to use functions without explicit
* locking, e.g. gmx_atomic_fetch_add() or gmx_atomic_cmpxchg().
*
* Not all architecture support atomic operations though inline assembly,
* and even if they do it might not be implemented here. In that case
* we use a fallback mutex implementation, so you can always count on
* the function interfaces working in Gromacs.
*
* Don't use spinlocks in non-performance-critical regions like file I/O.
* Since they always spin busy they would waste CPU cycles instead of
* properly yielding to a computation thread while waiting for the disk.
*
* Finally, note that all our spinlock operations are defined to return
* 0 if initialization or locking completes successfully.
* This is the opposite of some other implementations, but the same standard
* as used for pthread mutexes. So, if e.g. are trying to lock a spinlock,
* you will have gotten the lock if the return value is 0.
*
* gmx_spinlock_islocked(x) obviously still returns 1 if the lock is locked,
* and 0 if it is available, though...
*/
#include <stdio.h>
#include <pthread.h>
#ifdef __cplusplus
extern "C"
{
#endif
#if 0
} /* Avoids screwing up auto-indentation */
#endif
#if ( ( (defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__PATHSCALE__)) && \
(defined(i386) || defined(__x86_64__)) ) \
|| defined (DOXYGEN) )
/* This code is executed for x86 and x86-64, with these compilers:
* GNU
* Intel
* Pathscale
* All these support GCC-style inline assembly.
* We also use this section for the documentation.
*/
/*! \brief Memory barrier operation
*
* Modern CPUs rely heavily on out-of-order execution, and one common feature
* is that load/stores might be reordered. Also, when using inline assembly
* the compiler might already have loaded the variable we are changing into
* a register, so any update to memory won't be visible.
*
* This command creates a memory barrier, i.e. all memory results before
* it in the code should be visible to all memory operations after it - the
* CPU cannot propagate load/stores across it.
*/
#define gmx_atomic_memory_barrier() __asm__ __volatile__("": : :"memory")
/* Only gcc and Intel support this check, otherwise set it to true (skip doc) */
#if (!defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined DOXYGEN)
#define __builtin_constant_p(i) (1)
#endif
/*! \brief Gromacs atomic operations datatype
*
* Portable synchronization primitives like mutexes are effective for
* many purposes, but usually not very high performance.
* One of the problem is that you have the overhead of a function call,
* and another is that Mutexes often have extra overhead to make the
* scheduling fair. Finally, if performance is important we don't want
* to suspend the thread if we cannot lock a mutex, but spin-lock at 100%
* CPU usage until the resources is available (e.g. increment a counter).
*
* These things can often be implemented with inline-assembly or other
* system-dependent functions, and we provide such functionality for the
* most common platforms. For portability we also have a fallback
* implementation using a mutex for locking.
*
* Performance-wise, the fastest solution is always to avoid locking
* completely (obvious, but remember it!). If you cannot do that, the
* next best thing is to use atomic operations that e.g. increment a
* counter without explicit locking. Spinlocks are useful to lock an
* entire region, but leads to more overhead and can be difficult to
* debug - it is up to you to make sure that only the thread owning the
* lock unlocks it!
*
* You should normally NOT use atomic operations for things like
* I/O threads. These should yield to other threads while waiting for
* the disk instead of spinning at 100% CPU usage.
*
* It is imperative that you use the provided routines for reading
* and writing, since some implementations require memory barriers before
* the CPU or memory sees an updated result. The structure contents is
* only visible here so it can be inlined for performance - it might
* change without further notice.
*
* \note No initialization is required for atomic variables.
*
* Currently, we have (real) atomic operations for:
*
* - x86 or x86_64, using GNU compilers
* - x86 or x86_64, using Intel compilers
* - x86 or x86_64, using Pathscale compilers
* - Itanium, using GNU compilers
* - Itanium, using Intel compilers
* - Itanium, using HP compilers
* - PowerPC, using GNU compilers
* - PowerPC, using IBM AIX compilers
* - PowerPC, using IBM compilers >=7.0 under Linux or Mac OS X.
*/
typedef struct gmx_atomic
{
volatile int value; /*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t;
/*! \brief Gromacs spinlock
*
* Spinlocks provide a faster synchronization than mutexes,
* although they consume CPU-cycles while waiting. They are implemented
* with atomic operations and inline assembly whenever possible, and
* otherwise we use a fallback implementation where a spinlock is identical
* to a mutex (this is one of the reasons why you have to initialize them).
*
* There are no guarantees whatsoever about fair scheduling or
* debugging if you make a mistake and unlock a variable somebody
* else has locked - performance is the primary goal of spinlocks.
*
*/
typedef struct gmx_spinlock
{
volatile unsigned int lock; /*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t;
/*! \brief Spinlock static initializer
*
* This is used for static spinlock initialization, and has the same
* properties as GMX_THREAD_MUTEX_INITIALIZER has for mutexes.
* This is only for inlining in the gmx_thread.h header file. Whether
* it is 0, 1, or something else when unlocked depends on the platform.
* Don't assume anything about it. It might even be a mutex when using the
* fallback implementation!
*/
#define GMX_SPINLOCK_INITIALIZER { 1 }
/*! \brief Return value of an atomic integer
*
* Also implements proper memory barriers when necessary.
* The actual implementation is system-dependent.
*
* \param a Atomic variable to read
* \return Integer value of the atomic variable
*/
#define gmx_atomic_read(a) ((a)->value)
/*! \brief Write value to an atomic integer
*
* Also implements proper memory barriers when necessary.
* The actual implementation is system-dependent.
*
* \param a Atomic variable
* \param i Integer to set the atomic variable to.
*/
#define gmx_atomic_set(a,i) (((a)->value) = (i))
/*! \brief Add integer to atomic variable
*
* Also implements proper memory barriers when necessary.
* The actual implementation is system-dependent.
*
* \param a atomic datatype to modify
* \param i integer to increment with. Use i<0 to subtract atomically.
*
* \return The new value (after summation).
*/
static inline int
gmx_atomic_add_return(gmx_atomic_t * a,
volatile int i)
{
int __i;
__i = i;
__asm__ __volatile__("lock ; xaddl %0, %1;"
:"=r"(i) :"m"(a->value), "0"(i));
return i + __i;
}
/*! \brief Add to variable, return the old value.
*
* This operation is quite useful for synchronization counters.
* By performing a fetchadd with N, a thread can e.g. reserve a chunk
* with the next N iterations, and the return value is the index
* of the first element to treat.
*
* Also implements proper memory barriers when necessary.
* The actual implementation is system-dependent.
*
* \param a atomic datatype to modify
* \param i integer to increment with. Use i<0 to subtract atomically.
*
* \return The value of the atomic variable before addition.
*/
static inline int
gmx_atomic_fetch_add(gmx_atomic_t * a,
volatile int i)
{
int __i;
__i = i;
__asm__ __volatile__("lock ; xaddl %0, %1;"
:"=r"(i) :"m"(a->value), "0"(i));
return i;
}
/*! \brief Atomic compare-exchange operation
*
* The \a old value is compared with the memory value in the atomic datatype.
* If the are identical, the atomic type is updated to the new value,
* and otherwise left unchanged.
*
* This is a very useful synchronization primitive: You can start by reading
* a value (without locking anything), perform some calculations, and then
* atomically try to update it in memory unless it has changed. If it has
* changed you will get an error return code - reread the new value
* an repeat the calculations in that case.
*
* \param a Atomic datatype ('memory' value)
* \param oldval Integer value read from the atomic type at an earlier point
* \param newval New value to write to the atomic type if it currently is
* identical to the old value.
*
* \return The value of the atomic memory variable in memory when this
* instruction was executed. This, if the operation succeeded the
* return value was identical to the \a old parameter, and if not
* it returns the updated value in memory so you can repeat your
* operations on it.
*
* \note The exchange occured if the return value is identical to \a old.
*/
static inline int
gmx_atomic_cmpxchg(gmx_atomic_t * a,
int oldval,
int newval)
{
volatile unsigned long prev;
__asm__ __volatile__("lock ; cmpxchgl %1,%2"
: "=a"(prev)
: "q"(newval), "m"(a->value), "0"(oldval)
: "memory");
return prev;
}
/*! \brief Initialize spinlock
*
* In theory you can call this from multiple threads, but remember
* that we don't check for errors. If the first thread proceeded to
* lock the spinlock after initialization, the second will happily
* overwrite the contents and unlock it without warning you.
*
* \param x Gromacs spinlock pointer.
*/
static inline void
gmx_spinlock_init(gmx_spinlock_t * x)
{
x->lock = 1;
}
/*! \brief Acquire spinlock
*
* This routine blocks until the spinlock is available, and
* the locks it again before returning.
*
* \param x Gromacs spinlock pointer
*/
static inline void
gmx_spinlock_lock(gmx_spinlock_t * x)
{
__asm__ __volatile__("\n1:\t"
"lock ; decb %0\n\t"
"jns 3f\n"
"2:\t"
"rep;nop\n\t"
"cmpb $0,%0\n\t"
"jle 2b\n\t"
"jmp 1b\n"
"3:\n\t"
:"=m" (x->lock) : : "memory");
}
/*! \brief Attempt to acquire spinlock
*
* This routine acquires the spinlock if possible, but if
* already locked it return an error code immediately.
*
* \param x Gromacs spinlock pointer
*
* \return 0 if the mutex was available so we could lock it,
* otherwise a non-zero integer (1) if the lock is busy.
*/
static inline int
gmx_spinlock_trylock(gmx_spinlock_t * x)
{
char old_value;
__asm__ __volatile__("xchgb %b0,%1"
:"=q" (old_value), "=m" (x->lock)
:"0" (0) : "memory");
return (old_value <= 0);
}
/*! \brief Release spinlock
*
* \param x Gromacs spinlock pointer
*
* Unlocks the spinlock, regardless if which thread locked it.
*/
static inline void
gmx_spinlock_unlock(gmx_spinlock_t * x)
{
char old_value = 1;
__asm__ __volatile__(
"xchgb %b0, %1"
:"=q" (old_value), "=m" (x->lock)
:"0" (old_value) : "memory"
);
}
/*! \brief Check if spinlock is locked
*
* This routine returns immediately with the lock status.
*
* \param x Gromacs spinlock pointer
*
* \return 1 if the spinlock is locked, 0 otherwise.
*/
static inline int
gmx_spinlock_islocked(gmx_spinlock_t * x)
{
return (*(volatile signed char *)(&(x)->lock) <= 0);
}
/*! \brief Wait for a spinlock to become available
*
* This routine blocks until the spinlock is unlocked,
* but in contrast to gmx_spinlock_lock() it returns without
* trying to lock the spinlock.
*
* \param x Gromacs spinlock pointer
*/
static inline void
gmx_spinlock_wait(gmx_spinlock_t * x)
{
do
{
gmx_atomic_memory_barrier();
}
while(gmx_spinlock_islocked(x));
}
#elif ( defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__)))
/* PowerPC using proper GCC inline assembly.
* Recent versions of xlC (>=7.0) _partially_ support this, but since it is
* not 100% compatible we provide a separate implementation for xlC in
* the next section.
*/
/* Compiler-dependent stuff: GCC memory barrier */
#define gmx_atomic_memory_barrier() __asm__ __volatile__("": : :"memory")
typedef struct gmx_atomic
{
volatile int value; /*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t;
typedef struct gmx_spinlock
{
volatile unsigned int lock; /*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t;
#define GMX_SPINLOCK_INITIALIZER { 0 }
#define gmx_atomic_read(a) ((a)->value)
#define gmx_atomic_set(a,i) (((a)->value) = (i))
static inline int
gmx_atomic_add_return(gmx_atomic_t * a,
int i)
{
int t;
__asm__ __volatile__("1: lwarx %0,0,%2\n"
"\tadd %0,%1,%0\n"
"\tstwcx. %0,0,%2 \n"
"\tbne- 1b"
"\tisync\n"
: "=&r" (t)
: "r" (i), "r" (&a->value)
: "cc" , "memory");
return t;
}
static inline int
gmx_atomic_fetch_add(gmx_atomic_t * a,
int i)
{
int t;
__asm__ __volatile__("\teieio\n"
"1: lwarx %0,0,%2\n"
"\tadd %0,%1,%0\n"
"\tstwcx. %0,0,%2 \n"
"\tbne- 1b\n"
"\tisync\n"
: "=&r" (t)
: "r" (i), "r" (&a->value)
: "cc", "memory");
return (t - i);
}
static inline int
gmx_atomic_cmpxchg(gmx_atomic_t * a,
int oldval,
int newval)
{
int prev;
__asm__ __volatile__ ("1: lwarx %0,0,%2 \n"
"\tcmpw 0,%0,%3 \n"
"\tbne 2f \n"
"\tstwcx. %4,0,%2 \n"
"bne- 1b\n"
"\tsync\n"
"2:\n"
: "=&r" (prev), "=m" (a->value)
: "r" (&a->value), "r" (oldval), "r" (newval), "m" (a->value)
: "cc", "memory");
return prev;
}
static inline void
gmx_spinlock_init(gmx_spinlock_t *x)
{
x->lock = 0;
}
static inline void
gmx_spinlock_lock(gmx_spinlock_t * x)
{
unsigned int tmp;
__asm__ __volatile__("\tb 1f\n"
"2: lwzx %0,0,%1\n"
"\tcmpwi 0,%0,0\n"
"\tbne+ 2b\n"
"1: lwarx %0,0,%1\n"
"\tcmpwi 0,%0,0\n"
"\tbne- 2b\n"
"\tstwcx. %2,0,%1\n"
"bne- 2b\n"
"\tisync\n"
: "=&r"(tmp)
: "r"(&x->lock), "r"(1)
: "cr0", "memory");
}
static inline int
gmx_spinlock_trylock(gmx_spinlock_t * x)
{
unsigned int old, t;
unsigned int mask = 1;
volatile unsigned int *p = &x->lock;
__asm__ __volatile__("\teieio\n"
"1: lwarx %0,0,%4 \n"
"\tor %1,%0,%3 \n"
"\tstwcx. %1,0,%4 \n"
"\tbne 1b\n"
"\tsync\n"
: "=&r" (old), "=&r" (t), "=m" (*p)
: "r" (mask), "r" (p), "m" (*p)
: "cc", "memory");
return ((old & mask) != 0);
}
static inline void
gmx_spinlock_unlock(gmx_spinlock_t * x)
{
__asm__ __volatile__("\teieio\n": : :"memory");
x->lock = 0;
}
static inline int
gmx_spinlock_islocked(gmx_spinlock_t * x)
{
return ( x->lock != 0);
}
static inline void
gmx_spinlock_wait(gmx_spinlock_t *x)
{
do
{
gmx_atomic_memory_barrier();
}
while(gmx_spinlock_islocked(x));
}
#elif ( (defined(__IBM_GCC_ASM) || defined(__IBM_STDCPP_ASM)) && \
(defined(__powerpc__) || defined(__ppc__)))
/* PowerPC using xlC inline assembly.
* Recent versions of xlC (>=7.0) _partially_ support GCC inline assembly
* if you use the option -qasm=gcc but we have had to hack things a bit, in
* particular when it comes to clobbered variables. Since this implementation
* _could_ be buggy, we have separated it from the known-to-be-working gcc
* one above.
*/
/* memory barrier - no idea how to create one with xlc! */
#define gmx_atomic_memory_barrier()
typedef struct gmx_atomic
{
volatile int value; /*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t;
typedef struct gmx_spinlock
{
volatile unsigned int lock; /*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t;
#define GMX_SPINLOCK_INITIALIZER { 0 }
#define gmx_atomic_read(a) ((a)->value)
#define gmx_atomic_set(a,i) (((a)->value) = (i))
static inline int
gmx_atomic_add_return(gmx_atomic_t * a,
int i)
{
int t;
__asm__ __volatile__("1: lwarx %0,0,%2 \n"
"\t add %0,%1,%0 \n"
"\t stwcx. %0,0,%2 \n"
"\t bne- 1b \n"
"\t isync \n"
: "=&r" (t)
: "r" (i), "r" (&a->value) );
return t;
}
static inline int
gmx_atomic_fetch_add(gmx_atomic_t * a,
int i)
{
int t;
__asm__ __volatile__("\t eieio\n"
"1: lwarx %0,0,%2 \n"
"\t add %0,%1,%0 \n"
"\t stwcx. %0,0,%2 \n"
"\t bne- 1b \n"
"\t isync \n"
: "=&r" (t)
: "r" (i), "r" (&a->value));
return (t - i);
}
static inline int
gmx_atomic_cmpxchg(gmx_atomic_t * a,
int oldval,
int newval)
{
int prev;
__asm__ __volatile__ ("1: lwarx %0,0,%2 \n"
"\t cmpw 0,%0,%3 \n"
"\t bne 2f \n"
"\t stwcx. %4,0,%2 \n"
"\t bne- 1b \n"
"\t sync \n"
"2: \n"
: "=&r" (prev), "=m" (a->value)
: "r" (&a->value), "r" (oldval), "r" (newval), "m" (a->value));
return prev;
}
static inline void
gmx_spinlock_init(gmx_spinlock_t *x)
{
x->lock = 0;
}
static inline void
gmx_spinlock_lock(gmx_spinlock_t * x)
{
unsigned int tmp;
__asm__ __volatile__("\t b 1f \n"
"2: lwzx %0,0,%1 \n"
"\t cmpwi 0,%0,0 \n"
"\t bne+ 2b \n"
"1: lwarx %0,0,%1 \n"
"\t cmpwi 0,%0,0 \n"
"\t bne- 2b \n"
"\t stwcx. %2,0,%1 \n"
"\t bne- 2b \n"
"\t isync\n"
: "=&r"(tmp)
: "r"(&x->lock), "r"(1));
}
static inline int
gmx_spinlock_trylock(gmx_spinlock_t * x)
{
unsigned int old, t;
unsigned int mask = 1;
volatile unsigned int *p = &x->lock;
__asm__ __volatile__("\t eieio\n"
"1: lwarx %0,0,%4 \n"
"\t or %1,%0,%3 \n"
"\t stwcx. %1,0,%4 \n"
"\t bne 1b \n"
"\t sync \n"
: "=&r" (old), "=&r" (t), "=m" (*p)
: "r" (mask), "r" (p), "m" (*p));
return ((old & mask) != 0);
}
static inline void
gmx_spinlock_unlock(gmx_spinlock_t * x)
{
__asm__ __volatile__("\t eieio \n");
x->lock = 0;
}
static inline void
gmx_spinlock_islocked(gmx_spinlock_t * x)
{
return ( x->lock != 0);
}
static inline void
gmx_spinlock_wait(gmx_spinlock_t * x)
{
do
{
gmx_atomic_memory_barrier();
}
while(gmx_spinlock_islocked(x));
}
#elif (defined(__ia64__) && (defined(__GNUC__) || defined(__INTEL_COMPILER)))
/* ia64 with GCC or Intel compilers. Since we need to define everything through
* cmpxchg and fetchadd on ia64, we merge the different compilers and only provide
* different implementations for that single function.
* Documentation? Check the gcc/x86 section.
*/
typedef struct gmx_atomic
{
volatile int value; /*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t;
typedef struct gmx_spinlock
{
volatile unsigned int lock; /*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t;
#define GMX_SPINLOCK_INITIALIZER { 0 }
#define gmx_atomic_read(a) ((a)->value)
#define gmx_atomic_set(a,i) (((a)->value) = (i))
/* Compiler thingies */
#ifdef __INTEL_COMPILER
void __memory_barrier(void);
int _InterlockedCompareExchange(volatile int *dest, int xchg, int comp);
unsigned __int64 __fetchadd4_rel(unsigned int *addend, const int increment);
/* ia64 memory barrier */
# define gmx_atomic_memory_barrier() __memory_barrier()
/* ia64 cmpxchg */
# define gmx_atomic_cmpxchg(a, oldval, newval) _InterlockedCompareExchange(&a->value,newval,oldval)
/* ia64 fetchadd, but it only works with increments +/- 1,4,8,16 */
# define gmx_ia64_fetchadd(a, inc) __fetchadd4_rel(a, inc)
#elif defined __GNUC__
/* ia64 memory barrier */
# define gmx_atomic_memory_barrier() asm volatile ("":::"memory")
/* ia64 cmpxchg */
static inline int
gmx_atomic_cmpxchg(gmx_atomic_t * a,
int oldval,
int newval)
{
volatile int res;
asm volatile ("mov ar.ccv=%0;;" :: "rO"(oldval));
asm volatile ("cmpxchg4.acq %0=[%1],%2,ar.ccv":
"=r"(res) : "r"(&a->value), "r"(newval) : "memory");
return res;
}
/* fetchadd, but on ia64 it only works with increments +/- 1,4,8,16 */
#define gmx_ia64_fetchadd(a, inc) \
({ unsigned long res; \
asm volatile ("fetchadd4.rel %0=[%1],%2" \
: "=r"(res) : "r"(a), "i" (inc) : "memory"); \
res; \
})
#else /* Unknown compiler */
# error Unknown ia64 compiler (not GCC or ICC) - modify gmx_thread.h!
#endif
static inline int
gmx_atomic_add_return(gmx_atomic_t * a,
volatile int i)
{
volatile int oldval,newval;
volatile int __i = i;
/* Use fetchadd if, and only if, the increment value can be determined
* at compile time (otherwise this check is optimized away) and it is
* a value supported by fetchadd (1,4,8,16,-1,-4,-8,-16).
*/
if (__builtin_constant_p(i) &&
( (__i == 1) || (__i == 4) || (__i == 8) || (__i == 16) ||
(__i == -1) || (__i == -4) || (__i == -8) || (__i == -16) ) )
{
oldval = gmx_ia64_fetchadd(a,__i);
newval = oldval + i;
}
else
{
/* Use compare-exchange addition that works with any value */
do
{
oldval = gmx_atomic_read(a);
newval = oldval + i;
}
while(gmx_atomic_cmpxchg(a,oldval,newval) != oldval);
}
return newval;
}
static inline int
gmx_atomic_fetch_add(gmx_atomic_t * a,
volatile int i)
{
volatile int oldval,newval;
volatile int __i = i;
/* Use ia64 fetchadd if, and only if, the increment value can be determined
* at compile time (otherwise this check is optimized away) and it is
* a value supported by fetchadd (1,4,8,16,-1,-4,-8,-16).
*/
if (__builtin_constant_p(i) &&
( (__i == 1) || (__i == 4) || (__i == 8) || (__i == 16) ||
(__i == -1) || (__i == -4) || (__i == -8) || (__i == -16) ) )
{
oldval = gmx_ia64_fetchadd(a,__i);
newval = oldval + i;
}
else
{
/* Use compare-exchange addition that works with any value */
do
{
oldval = gmx_atomic_read(a);
newval = oldval + i;
}
while(gmx_atomic_cmpxchg(a,oldval,newval) != oldval);
}
return oldval;
}
static inline void
gmx_spinlock_init(gmx_spinlock_t *x)
{
x->lock = 0;
}
static inline void
gmx_spinlock_lock(gmx_spinlock_t * x)
{
gmx_atomic_t *a = (gmx_atomic_t *) x;
unsigned long value;
value = gmx_atomic_cmpxchg(a, 0, 1);
if (value)
{
do
{
while (a->value != 0)
{
gmx_atomic_memory_barrier();
}
value = gmx_atomic_cmpxchg(a, 0, 1);
}
while (value);
}
}
static inline int
gmx_spinlock_trylock(gmx_spinlock_t * x)
{
return (gmx_atomic_cmpxchg((gmx_atomic_t *)x, 0, 1) != 0);
}
static inline void
gmx_spinlock_unlock(gmx_spinlock_t * x)
{
do
{
gmx_atomic_memory_barrier();
x->lock = 0;
}
while (0);
}
static inline int
gmx_spinlock_islocked(gmx_spinlock_t * x)
{
return (x->lock != 0);
}
static inline void
gmx_spinlock_wait(gmx_spinlock_t * x)
{
do
{
gmx_atomic_memory_barrier();
}
while(gmx_spinlock_islocked(x));
}
#undef gmx_ia64_fetchadd
#elif (defined(__hpux) || defined(__HP_cc)) && defined(__ia64)
/* HP compiler on ia64 */
#include <machine/sys/inline.h>
#define gmx_atomic_memory_barrier() _Asm_mf()
#define gmx_hpia64_fetchadd(a, i) \
_Asm_fetchadd((_Asm_fasz)_FASZ_W,(_Asm_sem)_SEM_REL, \
(UInt32*)a,(unsigned int) i, \
(_Asm_ldhint)LDHINT_NONE)
typedef struct gmx_atomic
{
volatile int value; /*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t;
typedef struct gmx_spinlock
{
volatile unsigned int lock; /*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t;
static inline int
gmx_atomic_cmpxchg(gmx_atomic_t * a,
int oldval,
int newval)
{
int ret;
_Asm_mov_to_ar((_Asm_app_reg)_AREG_CCV,(Uint32)oldval,
(_Asm_fence)(_UP_CALL_FENCE | _UP_SYS_FENCE |
_DOWN_CALL_FENCE | _DOWN_SYS_FENCE));
ret = _Asm_cmpxchg((_Asm_sz)SZ_W,(_Asm_sem)_SEM_ACQ,(Uint32*)a,
(Uint32)newval,(_Asm_ldhint)_LDHINT_NONE);
return ret;
}
#define GMX_SPINLOCK_INITIALIZER { 0 }
#define gmx_atomic_read(a) ((a)->value)
#define gmx_atomic_set(a,i) (((a)->value) = (i))
static inline void
gmx_atomic_add_return(gmx_atomic_t * a,
int i)
{
int old,new;
int __i = i;
/* On HP-UX we don't know any macro to determine whether the increment
* is known at compile time, but hopefully the call uses something simple
* like a constant, and then the optimizer should be able to do the job.
*/
if ( (__i == 1) || (__i == 4) || (__i == 8) || (__i == 16) ||
(__i == -1) || (__i == -4) || (__i == -8) || (__i == -16) )
{
oldval = gmx_hpia64_fetchadd(a,__i);
newval = oldval + i;
}
else
{
/* Use compare-exchange addition that works with any value */
do
{
oldval = gmx_atomic_read(a);
newval = oldval + i;
}
while(gmx_atomic_cmpxchg(a,oldval,newval) != oldval);
}
return newval;
}
static inline int
gmx_atomic_fetch_add(gmx_atomic_t * a,
int i)
{
int oldval,newval;
int __i = i;
/* On HP-UX we don't know any macro to determine whether the increment
* is known at compile time, but hopefully the call uses something simple
* like a constant, and then the optimizer should be able to do the job.
*/
if ( (__i == 1) || (__i == 4) || (__i == 8) || (__i == 16) ||
(__i == -1) || (__i == -4) || (__i == -8) || (__i == -16) )
{
oldval = gmx_hpia64_fetchadd(a,__i);
newval = oldval + i;
}
else
{
/* Use compare-exchange addition that works with any value */
do
{
oldval = gmx_atomic_read(a);
newval = oldval + i;
}
while(gmx_atomic_cmpxchg(a,oldval,newval) != oldval);
}
return oldval;
}
static inline void
gmx_spinlock_init(gmx_spinlock_t *x)
{
x->lock = 0;
}
static inline void
gmx_spinlock_trylock(gmx_spinlock_t *x)
{
int rc;
rc = _Asm_xchg((_Asm_sz)_SZ_W, (unsigned int *)x, 1
(_Asm_ldhit)_LDHINT_NONE);
return ( (rc>0) ? 1 : 0);
}
static inline void
gmx_spinlock_lock(gmx_spinlock_t *x)
{
int status = 1;
do
{
if( *((unsigned int *)x->lock) == 0 )
{
status = gmx_spinlock_trylock(x);
}
} while( status != 0);
}
static inline void
gmx_spinlock_unlock(gmx_spinlock_t * x)
{
_Asm_fetchadd((_Asm_fasz)_SZ_W,(_Asm_sem)_SEM_REL,
(unsigned int *)x,-1,(_Asm_ldhint)_LDHINT_NONE);
}
static inline void
gmx_spinlock_islocked(gmx_spinlock_t * x)
{
return ( x->lock != 0 );
}
static inline void
gmx_spinlock_wait(gmx_spinlock_t * x)
{
do
{
gmx_atomic_memory_barrier();
}
while(gmx_spinlock_islocked(x));
}
#undef gmx_hpia64_fetchadd
#elif (defined(_MSC_VER) && (_MSC_VER >= 1200))
/* Microsoft Visual C on x86, define taken from FFTW who got it from Morten Nissov */
#include <windows.h>
#define gmx_atomic_memory_barrier()
typedef struct gmx_atomic
{
LONG volatile value; /*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t;
typedef struct gmx_spinlock
{
LONG volatile lock; /*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t;
#define GMX_SPINLOCK_INITIALIZER { 0 }
#define gmx_atomic_read(a) ((a)->value)
#define gmx_atomic_set(a,i) (((a)->value) = (i))
#define gmx_atomic_fetch_add(a, i) \
InterlockedExchangeAdd((LONG volatile *)a, (LONG) i)
#define gmx_atomic_add_return(a, i) \
( i + InterlockedExchangeAdd((LONG volatile *)a, (LONG) i) )
#define gmx_atomic_cmpxchg(a, oldval, newval) \
InterlockedCompareExchange((LONG volatile *)a, (LONG) newval, (LONG) oldval)
# define gmx_spinlock_lock(x) \
while((InterlockedCompareExchange((LONG volatile *)&x, 1, 0))!=0)
#define gmx_spinlock_trylock(x) \
InterlockedCompareExchange((LONG volatile *)&x, 1, 0)
static inline void
gmx_spinlock_unlock(gmx_spinlock_t * x)
{
x->lock = 0;
}
static inline int
gmx_spinlock_islocked(gmx_spinlock_t * x)
{
return (*(volatile signed char *)(&(x)->lock) != 0);
}
static inline void
gmx_spinlock_wait(gmx_spinlock_t * x)
{
while(gmx_spinlock_islocked(x))
{
Sleep(0);
}
}
#elif defined(__xlC__) && defined (_AIX)
/* IBM xlC compiler on AIX */
#include <sys/atomic_op.h>
#define gmx_atomic_memory_barrier()
typedef struct gmx_atomic
{
volatile int value; /*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t;
typedef struct gmx_spinlock
{
volatile unsigned int lock; /*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t;
static inline int
gmx_atomic_cmpxchg(gmx_atomic_t * a,
int oldval,
int newval)
{
int t;
if(__check_lock((atomic_p)&a->value, oldval, newval))
{
/* Not successful - value had changed in memory. Reload value. */
t = a->value;
}
else
{
/* replacement suceeded */
t = oldval;
}
return t;
}
static inline void
gmx_atomic_add_return(gmx_atomic_t * a,
int i)
{
int oldval,newval;
do
{
oldval = gmx_atomic_read(a);
newval = oldval + i;
}
while(__check_lock((atomic_p)&a->value, oldval, newval));
return newval;
}
static inline void
gmx_atomic_fetch_add(gmx_atomic_t * a,
int i)
{
int oldval,newval;
do
{
oldval = gmx_atomic_read(a);
newval = oldval + i;
}
while(__check_lock((atomic_p)&a->value, oldval, newval));
return oldval;
}
static inline void
gmx_spinlock_init(gmx_spinlock_t * x)
{
__clear_lock((atomic_p)x,0);
}
static inline void
gmx_spinlock_lock(gmx_spinlock_t * x)
{
do
{
;
}
while(__check_lock((atomic_p)x, 0, 1));
}
static inline void
gmx_spinlock_trylock(gmx_spinlock_t * x)
{
/* Return 0 if we got the lock */
return (__check_lock((atomic_p)x, 0, 1) != 0)
}
static inline void
gmx_spinlock_unlock(gmx_spinlock_t * x)
{
__clear_lock((atomic_p)x,0);
}
static inline void
gmx_spinlock_islocked(gmx_spinlock_t * x)
{
return (*((atomic_p)x) != 0);
}
static inline void
gmx_spinlock_wait(gmx_spinlock_t * x)
{
while(gmx_spinlock_islocked(x)) { ; }
}
#else
/* No atomic operations, use mutex fallback. Documentation is in x86 section */
#define gmx_atomic_memory_barrier()
/* System mutex used for locking to guarantee atomicity */
static pthread_mutex_t
gmx_atomic_mutex = PTHREAD_MUTEX_INITIALIZER;
typedef struct gmx_atomic
{
int value;
}
gmx_atomic_t;
#define gmx_spinlock_t pthread_mutex_t
# define GMX_SPINLOCK_INITIALIZER PTHREAD_MUTEX_INITIALIZER
/* Since mutexes guarantee memory barriers this works fine */
#define gmx_atomic_read(a) ((a)->value)
static inline void
gmx_atomic_set(gmx_atomic_t * a,
int i)
{
/* Mutexes here are necessary to guarantee memory visibility */
pthread_mutex_lock(&gmx_atomic_mutex);
a->value = i;
pthread_mutex_unlock(&gmx_atomic_mutex);
}
static inline int
gmx_atomic_add_return(gmx_atomic_t * a,
int i)
{
int t;
pthread_mutex_lock(&gmx_atomic_mutex);
t = a->value + i;
a->value = t;
pthread_mutex_unlock(&gmx_atomic_mutex);
return t;
}
static inline int
gmx_atomic_fetch_add(gmx_atomic_t * a,
int i)
{
int old_value;
pthread_mutex_lock(&gmx_atomic_mutex);
old_value = a->value;
a->value = old_value + i;
pthread_mutex_unlock(&gmx_atomic_mutex);
return old_value;
}
static inline int
gmx_atomic_cmpxchg(gmx_atomic_t * a,
int oldv,
int newv)
{
int t;
pthread_mutex_lock(&gmx_atomic_mutex);
t = a->value;
if (t == oldv)
{
a->value = newv;
}
pthread_mutex_unlock(&gmx_atomic_mutex);
return t;
}
#define gmx_spinlock_init(lock) pthread_mutex_init(lock)
#define gmx_spinlock_lock(lock) pthread_mutex_lock(lock)
#define gmx_spinlock_trylock(lock) pthread_mutex_trylock(lock)
#define gmx_spinlock_unlock(lock) pthread_mutex_unlock(lock)
static inline int
gmx_spinlock_islocked(gmx_spinlock_t * x)
{
int rc;
if(gmx_spinlock_trylock(x) != 0)
{
/* It was locked */
return 1;
}
else
{
/* We just locked it */
gmx_spinlock_unlock(x);
return 0;
}
}
static inline void
gmx_spinlock_wait(gmx_spinlock_t * x)
{
int rc;
gmx_spinlock_lock(x);
/* Got the lock now, so the waiting is over */
gmx_spinlock_unlock(x);
}
#endif
/*! \brief Spinlock-based barrier type
*
* This barrier has the same functionality as the standard
* gmx_thread_barrier_t, but since it is based on spinlocks
* it provides faster synchronization at the cost of busy-waiting.
*
* Variables of this type should be initialized by calling
* gmx_spinlock_barrier_init() to set the number of threads
* that should be synchronized.
*/
typedef struct gmx_spinlock_barrier
{
gmx_atomic_t count; /*!< Number of threads remaining */
int threshold; /*!< Total number of threads */
volatile int cycle; /*!< Current cycle (alternating 0/1) */
}
gmx_spinlock_barrier_t;
/*! \brief Initialize spinlock-based barrier
*
* \param barrier Pointer to _spinlock_ barrier. Note that this is not
* the same datatype as the full, thread based, barrier.
* \param count Number of threads to synchronize. All threads
* will be released after \a count calls to
* gmx_spinlock_barrier_wait().
*/
static inline void
gmx_spinlock_barrier_init(gmx_spinlock_barrier_t * barrier,
int count)
{
barrier->threshold = count;
barrier->cycle = 0;
gmx_atomic_set(&(barrier->count),count);
}
/*! \brief Perform busy-waiting barrier synchronization
*
* This routine blocks until it has been called N times,
* where N is the count value the barrier was initialized with.
* After N total calls all threads return. The barrier automatically
* cycles, and thus requires another N calls to unblock another time.
*
* Note that spinlock-based barriers are completely different from
* standard ones (using mutexes and condition variables), only the
* functionality and names are similar.
*
* \param barrier Pointer to previously create barrier.
*
* \return The last thread returns -1, all the others 0.
*/
static inline int
gmx_spinlock_barrier_wait(gmx_spinlock_barrier_t * barrier)
{
int cycle;
int status;
/* We don't need to lock or use atomic ops here, since the cycle index
* cannot change until after the last thread has performed the check
* further down. Further, they cannot reach this point in the next
* barrier iteration until all of them have been released, and that
* happens after the cycle value has been updated.
*
* No synchronization == fast synchronization.
*/
cycle = barrier->cycle;
/* Decrement the count atomically and check if it is zero.
* This will only be true for the last thread calling us.
*/
if( gmx_atomic_add_return( &(barrier->count), -1 ) == 0)
{
gmx_atomic_set(&(barrier->count), barrier->threshold);
barrier->cycle = !barrier->cycle;
status = -1;
}
else
{
/* Wait until the last thread changes the cycle index.
* We are both using a memory barrier, and explicit
* volatile pointer cast to make sure the compiler
* doesn't try to be smart and cache the contents.
*/
do
{
gmx_atomic_memory_barrier();
}
while( *(volatile int *)(&(barrier->cycle)) == cycle);
status = 0;
}
return status;
}
#ifdef __cplusplus
}
#endif
#endif /* _GMX_ATOMIC_H_ */
......@@ -20,6 +20,7 @@ FOREACH(TEST_PROG ${TEST_PROGS})
# Link with shared library
ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_TARGET})
SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} single)
ENDFOREACH(TEST_PROG ${TEST_PROGS})
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This tests the CPU implementation of GBSAOBCForce.
*/
#include "openmm/internal/AssertionUtilities.h"
#include "openmm/Context.h"
#include "CpuPlatform.h"
#include "openmm/GBSAOBCForce.h"
#include "openmm/System.h"
#include "openmm/LangevinIntegrator.h"
#include "openmm/NonbondedForce.h"
#include "SimTKOpenMMRealType.h"
#include "sfmt/SFMT.h"
#include <iostream>
#include <vector>
using namespace OpenMM;
using namespace std;
const double TOL = 1e-5;
void testSingleParticle() {
CpuPlatform platform;
System system;
system.addParticle(2.0);
LangevinIntegrator integrator(0, 0.1, 0.01);
GBSAOBCForce* forceField = new GBSAOBCForce();
forceField->addParticle(0.5, 0.15, 1);
system.addForce(forceField);
Context context(system, integrator, platform);
vector<Vec3> positions(1);
positions[0] = Vec3(0, 0, 0);
context.setPositions(positions);
State state = context.getState(State::Energy);
double bornRadius = 0.15-0.009; // dielectric offset
double eps0 = EPSILON0;
double bornEnergy = (-0.5*0.5/(8*PI_M*eps0))*(1.0/forceField->getSoluteDielectric()-1.0/forceField->getSolventDielectric())/bornRadius;
double extendedRadius = 0.15+0.14; // probe radius
double nonpolarEnergy = CAL2JOULE*PI_M*0.0216*(10*extendedRadius)*(10*extendedRadius)*std::pow(0.15/bornRadius, 6.0); // Where did this formula come from? Just copied it from CpuImplicitSolvent.cpp
ASSERT_EQUAL_TOL((bornEnergy+nonpolarEnergy), state.getPotentialEnergy(), 0.01);
// Change the parameters and see if it is still correct.
forceField->setParticleParameters(0, 0.4, 0.25, 1);
forceField->updateParametersInContext(context);
state = context.getState(State::Energy);
bornRadius = 0.25-0.009; // dielectric offset
bornEnergy = (-0.4*0.4/(8*PI_M*eps0))*(1.0/forceField->getSoluteDielectric()-1.0/forceField->getSolventDielectric())/bornRadius;
extendedRadius = bornRadius+0.14;
nonpolarEnergy = CAL2JOULE*PI_M*0.0216*(10*extendedRadius)*(10*extendedRadius)*std::pow(0.25/bornRadius, 6.0);
ASSERT_EQUAL_TOL((bornEnergy+nonpolarEnergy), state.getPotentialEnergy(), 0.01);
}
void testCutoffAndPeriodic() {
CpuPlatform platform;
System system;
system.addParticle(1.0);
system.addParticle(1.0);
LangevinIntegrator integrator(0, 0.1, 0.01);
GBSAOBCForce* gbsa = new GBSAOBCForce();
NonbondedForce* nonbonded = new NonbondedForce();
gbsa->addParticle(-1, 0.15, 1);
nonbonded->addParticle(-1, 1, 0);
gbsa->addParticle(1, 0.15, 1);
nonbonded->addParticle(1, 1, 0);
const double cutoffDistance = 3.0;
const double boxSize = 10.0;
nonbonded->setCutoffDistance(cutoffDistance);
gbsa->setCutoffDistance(cutoffDistance);
system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
system.addForce(gbsa);
system.addForce(nonbonded);
vector<Vec3> positions(2);
positions[0] = Vec3(0, 0, 0);
positions[1] = Vec3(2, 0, 0);
// Calculate the forces for both cutoff and periodic with two different atom positions.
nonbonded->setNonbondedMethod(NonbondedForce::CutoffNonPeriodic);
gbsa->setNonbondedMethod(GBSAOBCForce::CutoffNonPeriodic);
Context context(system, integrator, platform);
context.setPositions(positions);
State state1 = context.getState(State::Forces);
nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
gbsa->setNonbondedMethod(GBSAOBCForce::CutoffPeriodic);
context.reinitialize();
context.setPositions(positions);
State state2 = context.getState(State::Forces);
positions[1][0]+= boxSize;
nonbonded->setNonbondedMethod(NonbondedForce::CutoffNonPeriodic);
gbsa->setNonbondedMethod(GBSAOBCForce::CutoffNonPeriodic);
context.reinitialize();
context.setPositions(positions);
State state3 = context.getState(State::Forces);
nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
gbsa->setNonbondedMethod(GBSAOBCForce::CutoffPeriodic);
context.reinitialize();
context.setPositions(positions);
State state4 = context.getState(State::Forces);
// All forces should be identical, exception state3 which should be zero.
ASSERT_EQUAL_VEC(state1.getForces()[0], state2.getForces()[0], 0.01);
ASSERT_EQUAL_VEC(state1.getForces()[1], state2.getForces()[1], 0.01);
ASSERT_EQUAL_VEC(state1.getForces()[0], state4.getForces()[0], 0.01);
ASSERT_EQUAL_VEC(state1.getForces()[1], state4.getForces()[1], 0.01);
ASSERT_EQUAL_VEC(state3.getForces()[0], Vec3(0, 0, 0), 0.01);
ASSERT_EQUAL_VEC(state3.getForces()[1], Vec3(0, 0, 0), 0.01);
}
void testForce(int numParticles, NonbondedForce::NonbondedMethod method, GBSAOBCForce::NonbondedMethod method2) {
CpuPlatform platform;
ReferencePlatform reference;
System system;
GBSAOBCForce* gbsa = new GBSAOBCForce();
NonbondedForce* nonbonded = new NonbondedForce();
for (int i = 0; i < numParticles; ++i) {
system.addParticle(1.0);
double charge = i%2 == 0 ? -1 : 1;
gbsa->addParticle(charge, 0.15, 1);
nonbonded->addParticle(charge, 1, 0);
}
nonbonded->setNonbondedMethod(method);
gbsa->setNonbondedMethod(method2);
nonbonded->setCutoffDistance(3.0);
gbsa->setCutoffDistance(3.0);
int grid = (int) floor(0.5+pow(numParticles, 1.0/3.0));
if (method == NonbondedForce::CutoffPeriodic) {
double boxSize = (grid+1)*1.1;
system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
}
system.addForce(gbsa);
system.addForce(nonbonded);
LangevinIntegrator integrator1(0, 0.1, 0.01);
LangevinIntegrator integrator2(0, 0.1, 0.01);
Context context(system, integrator1, platform);
Context refContext(system, integrator2, reference);
// Set random (but uniformly distributed) positions for all the particles.
vector<Vec3> positions(numParticles);
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
for (int i = 0; i < grid; i++)
for (int j = 0; j < grid; j++)
for (int k = 0; k < grid; k++)
positions[i*grid*grid+j*grid+k] = Vec3(i*1.1, j*1.1, k*1.1);
for (int i = 0; i < numParticles; ++i)
positions[i] = positions[i] + Vec3(0.5*genrand_real2(sfmt), 0.5*genrand_real2(sfmt), 0.5*genrand_real2(sfmt));
context.setPositions(positions);
refContext.setPositions(positions);
State state = context.getState(State::Forces | State::Energy);
State refState = refContext.getState(State::Forces | State::Energy);
// Make sure the CPU and Reference platforms agree.
double norm = 0.0;
double diff = 0.0;
for (int i = 0; i < numParticles; ++i) {
Vec3 f = state.getForces()[i];
norm += f[0]*f[0] + f[1]*f[1] + f[2]*f[2];
Vec3 delta = f-refState.getForces()[i];
diff += delta[0]*delta[0] + delta[1]*delta[1] + delta[2]*delta[2];
}
norm = std::sqrt(norm);
diff = std::sqrt(diff);
ASSERT_EQUAL_TOL(0.0, diff, 0.001*norm);
ASSERT_EQUAL_TOL(state.getPotentialEnergy(), refState.getPotentialEnergy(), 1e-3);
// Take a small step in the direction of the energy gradient and see whether the potential energy changes by the expected amount.
// (This doesn't work with cutoffs, since the energy changes discontinuously at the cutoff distance.)
if (method == NonbondedForce::NoCutoff)
{
const double delta = 0.3;
double step = 0.5*delta/norm;
vector<Vec3> positions2(numParticles), positions3(numParticles);
for (int i = 0; i < numParticles; ++i) {
Vec3 p = positions[i];
Vec3 f = state.getForces()[i];
positions2[i] = Vec3(p[0]-f[0]*step, p[1]-f[1]*step, p[2]-f[2]*step);
positions3[i] = Vec3(p[0]+f[0]*step, p[1]+f[1]*step, p[2]+f[2]*step);
}
context.setPositions(positions2);
State state2 = context.getState(State::Energy);
context.setPositions(positions3);
State state3 = context.getState(State::Energy);
ASSERT_EQUAL_TOL(norm, (state2.getPotentialEnergy()-state3.getPotentialEnergy())/delta, 1e-2)
}
}
int main() {
try {
if (!CpuPlatform::isProcessorSupported()) {
cout << "CPU is not supported. Exiting." << endl;
return 0;
}
testSingleParticle();
testCutoffAndPeriodic();
for (int i = 5; i < 11; i++) {
testForce(i*i*i, NonbondedForce::NoCutoff, GBSAOBCForce::NoCutoff);
testForce(i*i*i, NonbondedForce::CutoffNonPeriodic, GBSAOBCForce::CutoffNonPeriodic);
testForce(i*i*i, NonbondedForce::CutoffPeriodic, GBSAOBCForce::CutoffPeriodic);
}
}
catch(const exception& e) {
cout << "exception: " << e.what() << endl;
return 1;
}
cout << "Done" << endl;
return 0;
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This tests the reference implementation of LangevinIntegrator.
*/
#include "openmm/internal/AssertionUtilities.h"
#include "openmm/Context.h"
#include "CpuPlatform.h"
#include "openmm/HarmonicBondForce.h"
#include "openmm/NonbondedForce.h"
#include "openmm/System.h"
#include "openmm/LangevinIntegrator.h"
#include "SimTKOpenMMRealType.h"
#include "sfmt/SFMT.h"
#include <iostream>
#include <vector>
using namespace OpenMM;
using namespace std;
const double TOL = 1e-5;
void testSingleBond() {
CpuPlatform platform;
System system;
system.addParticle(2.0);
system.addParticle(2.0);
LangevinIntegrator integrator(0, 0.1, 0.01);
HarmonicBondForce* forceField = new HarmonicBondForce();
forceField->addBond(0, 1, 1.5, 1);
system.addForce(forceField);
Context context(system, integrator, platform);
vector<Vec3> positions(2);
positions[0] = Vec3(-1, 0, 0);
positions[1] = Vec3(1, 0, 0);
context.setPositions(positions);
// This is simply a damped harmonic oscillator, so compare it to the analytical solution.
double freq = std::sqrt(1-0.05*0.05);
for (int i = 0; i < 1000; ++i) {
State state = context.getState(State::Positions | State::Velocities);
double time = state.getTime();
double expectedDist = 1.5+0.5*std::exp(-0.05*time)*std::cos(freq*time);
ASSERT_EQUAL_VEC(Vec3(-0.5*expectedDist, 0, 0), state.getPositions()[0], 0.02);
ASSERT_EQUAL_VEC(Vec3(0.5*expectedDist, 0, 0), state.getPositions()[1], 0.02);
double expectedSpeed = -0.5*std::exp(-0.05*time)*(0.05*std::cos(freq*time)+freq*std::sin(freq*time));
ASSERT_EQUAL_VEC(Vec3(-0.5*expectedSpeed, 0, 0), state.getVelocities()[0], 0.02);
ASSERT_EQUAL_VEC(Vec3(0.5*expectedSpeed, 0, 0), state.getVelocities()[1], 0.02);
integrator.step(1);
}
// Not set the friction to a tiny value and see if it conserves energy.
integrator.setFriction(5e-5);
context.setPositions(positions);
State state = context.getState(State::Energy);
double initialEnergy = state.getKineticEnergy()+state.getPotentialEnergy();
for (int i = 0; i < 1000; ++i) {
state = context.getState(State::Energy);
double energy = state.getKineticEnergy()+state.getPotentialEnergy();
ASSERT_EQUAL_TOL(initialEnergy, energy, 0.01);
integrator.step(1);
}
}
void testTemperature() {
const int numParticles = 8;
const double temp = 100.0;
CpuPlatform platform;
System system;
LangevinIntegrator integrator(temp, 2.0, 0.01);
NonbondedForce* forceField = new NonbondedForce();
for (int i = 0; i < numParticles; ++i) {
system.addParticle(2.0);
forceField->addParticle((i%2 == 0 ? 1.0 : -1.0), 1.0, 5.0);
}
system.addForce(forceField);
Context context(system, integrator, platform);
vector<Vec3> positions(numParticles);
for (int i = 0; i < numParticles; ++i)
positions[i] = Vec3((i%2 == 0 ? 2 : -2), (i%4 < 2 ? 2 : -2), (i < 4 ? 2 : -2));
context.setPositions(positions);
// Let it equilibrate.
integrator.step(10000);
// Now run it for a while and see if the temperature is correct.
double ke = 0.0;
for (int i = 0; i < 10000; ++i) {
State state = context.getState(State::Energy);
ke += state.getKineticEnergy();
integrator.step(1);
}
ke /= 10000;
double expected = 0.5*numParticles*3*BOLTZ*temp;
ASSERT_USUALLY_EQUAL_TOL(expected, ke, 6/std::sqrt(10000.0));
}
void testConstraints() {
const int numParticles = 8;
const double temp = 100.0;
CpuPlatform platform;
System system;
LangevinIntegrator integrator(temp, 2.0, 0.01);
integrator.setConstraintTolerance(1e-5);
NonbondedForce* forceField = new NonbondedForce();
for (int i = 0; i < numParticles; ++i) {
system.addParticle(10.0);
forceField->addParticle((i%2 == 0 ? 0.2 : -0.2), 0.5, 5.0);
}
for (int i = 0; i < numParticles-1; ++i)
system.addConstraint(i, i+1, 1.0);
system.addForce(forceField);
Context context(system, integrator, platform);
vector<Vec3> positions(numParticles);
vector<Vec3> velocities(numParticles);
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
for (int i = 0; i < numParticles; ++i) {
positions[i] = Vec3(i/2, (i+1)/2, 0);
velocities[i] = Vec3(genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5);
}
context.setPositions(positions);
context.setVelocities(velocities);
// Simulate it and see whether the constraints remain satisfied.
for (int i = 0; i < 1000; ++i) {
State state = context.getState(State::Positions);
for (int j = 0; j < numParticles-1; ++j) {
Vec3 p1 = state.getPositions()[j];
Vec3 p2 = state.getPositions()[j+1];
double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
ASSERT_EQUAL_TOL(1.0, dist, 2e-5);
}
integrator.step(1);
}
}
void testConstrainedMasslessParticles() {
CpuPlatform platform;
System system;
system.addParticle(0.0);
system.addParticle(1.0);
system.addConstraint(0, 1, 1.5);
vector<Vec3> positions(2);
positions[0] = Vec3(-1, 0, 0);
positions[1] = Vec3(1, 0, 0);
LangevinIntegrator integrator(300.0, 2.0, 0.01);
bool failed = false;
try {
// This should throw an exception.
Context context(system, integrator, platform);
}
catch (exception& ex) {
failed = true;
}
ASSERT(failed);
// Now make both particles massless, which should work.
system.setParticleMass(1, 0.0);
Context context(system, integrator, platform);
context.setPositions(positions);
context.setVelocitiesToTemperature(300.0);
integrator.step(1);
State state = context.getState(State::Velocities | State::Positions);
ASSERT_EQUAL(0.0, state.getVelocities()[0][0]);
}
void testRandomSeed() {
const int numParticles = 8;
const double temp = 100.0;
const double collisionFreq = 10.0;
CpuPlatform platform;
System system;
LangevinIntegrator integrator(temp, 2.0, 0.01);
NonbondedForce* forceField = new NonbondedForce();
for (int i = 0; i < numParticles; ++i) {
system.addParticle(2.0);
forceField->addParticle((i%2 == 0 ? 1.0 : -1.0), 1.0, 5.0);
}
system.addForce(forceField);
vector<Vec3> positions(numParticles);
vector<Vec3> velocities(numParticles);
for (int i = 0; i < numParticles; ++i) {
positions[i] = Vec3((i%2 == 0 ? 2 : -2), (i%4 < 2 ? 2 : -2), (i < 4 ? 2 : -2));
velocities[i] = Vec3(0, 0, 0);
}
// Try twice with the same random seed.
integrator.setRandomNumberSeed(5);
Context context(system, integrator, platform);
context.setPositions(positions);
context.setVelocities(velocities);
integrator.step(10);
State state1 = context.getState(State::Positions);
context.reinitialize();
context.setPositions(positions);
context.setVelocities(velocities);
integrator.step(10);
State state2 = context.getState(State::Positions);
// Try twice with a different random seed.
integrator.setRandomNumberSeed(10);
context.reinitialize();
context.setPositions(positions);
context.setVelocities(velocities);
integrator.step(10);
State state3 = context.getState(State::Positions);
context.reinitialize();
context.setPositions(positions);
context.setVelocities(velocities);
integrator.step(10);
State state4 = context.getState(State::Positions);
// Compare the results.
for (int i = 0; i < numParticles; i++) {
for (int j = 0; j < 3; j++) {
ASSERT_EQUAL_TOL(state1.getPositions()[i][j], state2.getPositions()[i][j], 1e-5);
ASSERT_EQUAL_TOL(state3.getPositions()[i][j], state4.getPositions()[i][j], 1e-5);
ASSERT(state1.getPositions()[i][j] != state3.getPositions()[i][j]);
}
}
}
int main() {
try {
if (!CpuPlatform::isProcessorSupported()) {
cout << "CPU is not supported. Exiting." << endl;
return 0;
}
testSingleBond();
testTemperature();
testConstraints();
testConstrainedMasslessParticles();
testRandomSeed();
}
catch(const exception& e) {
cout << "exception: " << e.what() << endl;
return 1;
}
cout << "Done" << endl;
return 0;
}
......@@ -34,6 +34,8 @@
*/
#include "openmm/internal/AssertionUtilities.h"
#include "openmm/internal/ThreadPool.h"
#include "AlignedArray.h"
#include "CpuNeighborList.h"
#include "CpuPlatform.h"
#include "sfmt/SFMT.h"
......@@ -49,9 +51,10 @@ void testNeighborList(bool periodic) {
const int numParticles = 500;
const float cutoff = 2.0f;
const float boxSize[3] = {20.0f, 15.0f, 22.0f};
const int blockSize = 8;
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
vector<float> positions(4*numParticles);
AlignedArray<float> positions(4*numParticles);
for (int i = 0; i < 4*numParticles; i++)
if (i%4 < 3)
positions[i] = boxSize[i%4]*genrand_real2(sfmt);
......@@ -63,15 +66,16 @@ void testNeighborList(bool periodic) {
exclusions[i-j].insert(i);
}
}
CpuNeighborList neighborList;
neighborList.computeNeighborList(numParticles, positions, exclusions, boxSize, periodic, cutoff);
ThreadPool threads;
CpuNeighborList neighborList(blockSize);
neighborList.computeNeighborList(numParticles, positions, exclusions, boxSize, periodic, cutoff, threads);
// Convert the neighbor list to a set for faster lookup.
set<pair<int, int> > neighbors;
for (int i = 0; i < (int) neighborList.getSortedAtoms().size(); i++) {
int blockIndex = i/CpuNeighborList::BlockSize;
int indexInBlock = i-blockIndex*CpuNeighborList::BlockSize;
int blockIndex = i/blockSize;
int indexInBlock = i-blockIndex*blockSize;
char mask = 1<<indexInBlock;
for (int j = 0; j < (int) neighborList.getBlockExclusions(blockIndex).size(); j++) {
if ((neighborList.getBlockExclusions(blockIndex)[j] & mask) == 0) {
......
......@@ -397,44 +397,44 @@ void testLargeSystem() {
system.addForce(bonds);
VerletIntegrator integrator1(0.01);
VerletIntegrator integrator2(0.01);
Context cuContext(system, integrator1, platform);
Context cpuContext(system, integrator1, platform);
Context referenceContext(system, integrator2, reference);
cuContext.setPositions(positions);
cuContext.setVelocities(velocities);
cpuContext.setPositions(positions);
cpuContext.setVelocities(velocities);
referenceContext.setPositions(positions);
referenceContext.setVelocities(velocities);
State cuState = cuContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
State cpuState = cpuContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
State referenceState = referenceContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
for (int i = 0; i < numParticles; i++) {
ASSERT_EQUAL_VEC(cuState.getPositions()[i], referenceState.getPositions()[i], tol);
ASSERT_EQUAL_VEC(cuState.getVelocities()[i], referenceState.getVelocities()[i], tol);
ASSERT_EQUAL_VEC(cuState.getForces()[i], referenceState.getForces()[i], tol);
ASSERT_EQUAL_VEC(cpuState.getPositions()[i], referenceState.getPositions()[i], tol);
ASSERT_EQUAL_VEC(cpuState.getVelocities()[i], referenceState.getVelocities()[i], tol);
ASSERT_EQUAL_VEC(cpuState.getForces()[i], referenceState.getForces()[i], tol);
}
ASSERT_EQUAL_TOL(cuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
ASSERT_EQUAL_TOL(cpuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
// Now do the same thing with periodic boundary conditions.
nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
cuContext.reinitialize();
cpuContext.reinitialize();
referenceContext.reinitialize();
cuContext.setPositions(positions);
cuContext.setVelocities(velocities);
cpuContext.setPositions(positions);
cpuContext.setVelocities(velocities);
referenceContext.setPositions(positions);
referenceContext.setVelocities(velocities);
cuState = cuContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
cpuState = cpuContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
referenceState = referenceContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
for (int i = 0; i < numParticles; i++) {
double dx = cuState.getPositions()[i][0]-referenceState.getPositions()[i][0];
double dy = cuState.getPositions()[i][1]-referenceState.getPositions()[i][1];
double dz = cuState.getPositions()[i][2]-referenceState.getPositions()[i][2];
ASSERT_EQUAL_TOL(fmod(cuState.getPositions()[i][0]-referenceState.getPositions()[i][0], boxSize), 0, tol);
ASSERT_EQUAL_TOL(fmod(cuState.getPositions()[i][1]-referenceState.getPositions()[i][1], boxSize), 0, tol);
ASSERT_EQUAL_TOL(fmod(cuState.getPositions()[i][2]-referenceState.getPositions()[i][2], boxSize), 0, tol);
ASSERT_EQUAL_VEC(cuState.getVelocities()[i], referenceState.getVelocities()[i], tol);
ASSERT_EQUAL_VEC(cuState.getForces()[i], referenceState.getForces()[i], tol);
double dx = cpuState.getPositions()[i][0]-referenceState.getPositions()[i][0];
double dy = cpuState.getPositions()[i][1]-referenceState.getPositions()[i][1];
double dz = cpuState.getPositions()[i][2]-referenceState.getPositions()[i][2];
ASSERT_EQUAL_TOL(fmod(cpuState.getPositions()[i][0]-referenceState.getPositions()[i][0], boxSize), 0, tol);
ASSERT_EQUAL_TOL(fmod(cpuState.getPositions()[i][1]-referenceState.getPositions()[i][1], boxSize), 0, tol);
ASSERT_EQUAL_TOL(fmod(cpuState.getPositions()[i][2]-referenceState.getPositions()[i][2], boxSize), 0, tol);
ASSERT_EQUAL_VEC(cpuState.getVelocities()[i], referenceState.getVelocities()[i], tol);
ASSERT_EQUAL_VEC(cpuState.getForces()[i], referenceState.getForces()[i], tol);
}
ASSERT_EQUAL_TOL(cuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
ASSERT_EQUAL_TOL(cpuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
}
void testDispersionCorrection() {
......@@ -542,15 +542,15 @@ void testChangingParameters() {
VerletIntegrator integrator1(0.01);
VerletIntegrator integrator2(0.01);
Context cuContext(system, integrator1, platform);
Context cpuContext(system, integrator1, platform);
Context referenceContext(system, integrator2, reference);
cuContext.setPositions(positions);
cpuContext.setPositions(positions);
referenceContext.setPositions(positions);
State cuState = cuContext.getState(State::Forces | State::Energy);
State cpuState = cpuContext.getState(State::Forces | State::Energy);
State referenceState = referenceContext.getState(State::Forces | State::Energy);
for (int i = 0; i < numParticles; i++)
ASSERT_EQUAL_VEC(cuState.getForces()[i], referenceState.getForces()[i], tol);
ASSERT_EQUAL_TOL(cuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
ASSERT_EQUAL_VEC(cpuState.getForces()[i], referenceState.getForces()[i], tol);
ASSERT_EQUAL_TOL(cpuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
// Now modify parameters and see if they still agree.
......@@ -559,13 +559,13 @@ void testChangingParameters() {
nonbonded->getParticleParameters(i, charge, sigma, epsilon);
nonbonded->setParticleParameters(i, 1.5*charge, 1.1*sigma, 1.7*epsilon);
}
nonbonded->updateParametersInContext(cuContext);
nonbonded->updateParametersInContext(cpuContext);
nonbonded->updateParametersInContext(referenceContext);
cuState = cuContext.getState(State::Forces | State::Energy);
cpuState = cpuContext.getState(State::Forces | State::Energy);
referenceState = referenceContext.getState(State::Forces | State::Energy);
for (int i = 0; i < numParticles; i++)
ASSERT_EQUAL_VEC(cuState.getForces()[i], referenceState.getForces()[i], tol);
ASSERT_EQUAL_TOL(cuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
ASSERT_EQUAL_VEC(cpuState.getForces()[i], referenceState.getForces()[i], tol);
ASSERT_EQUAL_TOL(cpuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
}
void testSwitchingFunction(NonbondedForce::NonbondedMethod method) {
......
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2014 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This tests the CPU implementation of PeriodicTorsionForce.
*/
#include "openmm/internal/AssertionUtilities.h"
#include "openmm/Context.h"
#include "CpuPlatform.h"
#include "openmm/PeriodicTorsionForce.h"
#include "openmm/System.h"
#include "openmm/VerletIntegrator.h"
#include "SimTKOpenMMRealType.h"
#include <iostream>
#include <vector>
using namespace OpenMM;
using namespace std;
CpuPlatform platform;
const double TOL = 1e-5;
void testPeriodicTorsions() {
System system;
system.addParticle(1.0);
system.addParticle(1.0);
system.addParticle(1.0);
system.addParticle(1.0);
VerletIntegrator integrator(0.01);
PeriodicTorsionForce* forceField = new PeriodicTorsionForce();
forceField->addTorsion(0, 1, 2, 3, 2, PI_M/3, 1.1);
system.addForce(forceField);
Context context(system, integrator, platform);
vector<Vec3> positions(4);
positions[0] = Vec3(0, 1, 0);
positions[1] = Vec3(0, 0, 0);
positions[2] = Vec3(1, 0, 0);
positions[3] = Vec3(1, 0, 2);
context.setPositions(positions);
State state = context.getState(State::Forces | State::Energy);
{
const vector<Vec3>& forces = state.getForces();
double torque = -2*1.1*std::sin(2*PI_M/3);
ASSERT_EQUAL_VEC(Vec3(0, 0, torque), forces[0], TOL);
ASSERT_EQUAL_VEC(Vec3(0, 0.5*torque, 0), forces[3], TOL);
ASSERT_EQUAL_VEC(Vec3(forces[0][0]+forces[1][0]+forces[2][0]+forces[3][0], forces[0][1]+forces[1][1]+forces[2][1]+forces[3][1], forces[0][2]+forces[1][2]+forces[2][2]+forces[3][2]), Vec3(0, 0, 0), TOL);
ASSERT_EQUAL_TOL(1.1*(1+std::cos(2*PI_M/3)), state.getPotentialEnergy(), TOL);
}
// Try changing the torsion parameters and make sure it's still correct.
forceField->setTorsionParameters(0, 0, 1, 2, 3, 3, PI_M/3.2, 1.3);
forceField->updateParametersInContext(context);
state = context.getState(State::Forces | State::Energy);
{
const vector<Vec3>& forces = state.getForces();
double dtheta = (3*PI_M/2)-(PI_M/3.2);
double torque = -3*1.3*std::sin(dtheta);
ASSERT_EQUAL_VEC(Vec3(0, 0, torque), forces[0], TOL);
ASSERT_EQUAL_VEC(Vec3(0, 0.5*torque, 0), forces[3], TOL);
ASSERT_EQUAL_VEC(Vec3(forces[0][0]+forces[1][0]+forces[2][0]+forces[3][0], forces[0][1]+forces[1][1]+forces[2][1]+forces[3][1], forces[0][2]+forces[1][2]+forces[2][2]+forces[3][2]), Vec3(0, 0, 0), TOL);
ASSERT_EQUAL_TOL(1.3*(1+std::cos(dtheta)), state.getPotentialEnergy(), TOL);
}
}
void testParallelComputation() {
System system;
const int numParticles = 200;
for (int i = 0; i < numParticles; i++)
system.addParticle(1.0);
PeriodicTorsionForce* force = new PeriodicTorsionForce();
for (int i = 3; i < numParticles; i++)
force->addTorsion(i-3, i-2, i-1, i, 2, 1.1, i);
system.addForce(force);
vector<Vec3> positions(numParticles);
for (int i = 0; i < numParticles; i++)
positions[i] = Vec3(i, i%2, i%3);
VerletIntegrator integrator1(0.01);
ReferencePlatform reference;
Context context1(system, integrator1, reference);
context1.setPositions(positions);
State state1 = context1.getState(State::Forces | State::Energy);
VerletIntegrator integrator2(0.01);
Context context2(system, integrator2, platform);
context2.setPositions(positions);
State state2 = context2.getState(State::Forces | State::Energy);
ASSERT_EQUAL_TOL(state1.getPotentialEnergy(), state2.getPotentialEnergy(), 1e-5);
for (int i = 0; i < numParticles; i++)
ASSERT_EQUAL_VEC(state1.getForces()[i], state2.getForces()[i], 1e-5);
}
int main(int argc, char* argv[]) {
try {
testPeriodicTorsions();
testParallelComputation();
}
catch(const exception& e) {
cout << "exception: " << e.what() << endl;
return 1;
}
cout << "Done" << endl;
return 0;
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2014 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This tests the CUDA implementation of RBTorsionForce.
*/
#include "openmm/internal/AssertionUtilities.h"
#include "openmm/Context.h"
#include "CpuPlatform.h"
#include "openmm/RBTorsionForce.h"
#include "openmm/System.h"
#include "openmm/VerletIntegrator.h"
#include "SimTKOpenMMRealType.h"
#include <iostream>
#include <vector>
using namespace OpenMM;
using namespace std;
CpuPlatform platform;
const double TOL = 1e-5;
void testRBTorsions() {
System system;
system.addParticle(1.0);
system.addParticle(1.0);
system.addParticle(1.0);
system.addParticle(1.0);
VerletIntegrator integrator(0.01);
RBTorsionForce* forceField = new RBTorsionForce();
forceField->addTorsion(0, 1, 2, 3, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6);
system.addForce(forceField);
Context context(system, integrator, platform);
vector<Vec3> positions(4);
positions[0] = Vec3(0, 1, 0);
positions[1] = Vec3(0, 0, 0);
positions[2] = Vec3(1, 0, 0);
positions[3] = Vec3(1, 1, 1);
context.setPositions(positions);
State state = context.getState(State::Forces | State::Energy);
{
const vector<Vec3>& forces = state.getForces();
double psi = 0.25*PI_M - PI_M;
double torque = 0.0;
for (int i = 1; i < 6; ++i) {
double c = 0.1*(i+1);
torque += -c*i*std::pow(std::cos(psi), i-1)*std::sin(psi);
}
ASSERT_EQUAL_VEC(Vec3(0, 0, torque), forces[0], TOL);
ASSERT_EQUAL_VEC(Vec3(0, 0.5*torque, -0.5*torque), forces[3], TOL);
ASSERT_EQUAL_VEC(Vec3(forces[0][0]+forces[1][0]+forces[2][0]+forces[3][0], forces[0][1]+forces[1][1]+forces[2][1]+forces[3][1], forces[0][2]+forces[1][2]+forces[2][2]+forces[3][2]), Vec3(0, 0, 0), TOL);
double energy = 0.0;
for (int i = 0; i < 6; ++i) {
double c = 0.1*(i+1);
energy += c*std::pow(std::cos(psi), i);
}
ASSERT_EQUAL_TOL(energy, state.getPotentialEnergy(), TOL);
}
// Try changing the torsion parameters and make sure it's still correct.
forceField->setTorsionParameters(0, 0, 1, 2, 3, 0.11, 0.22, 0.33, 0.44, 0.55, 0.66);
forceField->updateParametersInContext(context);
state = context.getState(State::Forces | State::Energy);
{
const vector<Vec3>& forces = state.getForces();
double psi = 0.25*PI_M - PI_M;
double torque = 0.0;
for (int i = 1; i < 6; ++i) {
double c = 0.11*(i+1);
torque += -c*i*std::pow(std::cos(psi), i-1)*std::sin(psi);
}
ASSERT_EQUAL_VEC(Vec3(0, 0, torque), forces[0], TOL);
ASSERT_EQUAL_VEC(Vec3(0, 0.5*torque, -0.5*torque), forces[3], TOL);
ASSERT_EQUAL_VEC(Vec3(forces[0][0]+forces[1][0]+forces[2][0]+forces[3][0], forces[0][1]+forces[1][1]+forces[2][1]+forces[3][1], forces[0][2]+forces[1][2]+forces[2][2]+forces[3][2]), Vec3(0, 0, 0), TOL);
double energy = 0.0;
for (int i = 0; i < 6; ++i) {
double c = 0.11*(i+1);
energy += c*std::pow(std::cos(psi), i);
}
ASSERT_EQUAL_TOL(energy, state.getPotentialEnergy(), TOL);
}
}
void testParallelComputation() {
System system;
const int numParticles = 200;
for (int i = 0; i < numParticles; i++)
system.addParticle(1.0);
RBTorsionForce* force = new RBTorsionForce();
for (int i = 3; i < numParticles; i++)
force->addTorsion(i-3, i-2, i-1, i, 2, 0.1*i, 0.5*i, i, 1, 1);
system.addForce(force);
vector<Vec3> positions(numParticles);
for (int i = 0; i < numParticles; i++)
positions[i] = Vec3(i, i%2, i%3);
VerletIntegrator integrator1(0.01);
ReferencePlatform reference;
Context context1(system, integrator1, reference);
context1.setPositions(positions);
State state1 = context1.getState(State::Forces | State::Energy);
VerletIntegrator integrator2(0.01);
Context context2(system, integrator2, platform);
context2.setPositions(positions);
State state2 = context2.getState(State::Forces | State::Energy);
ASSERT_EQUAL_TOL(state1.getPotentialEnergy(), state2.getPotentialEnergy(), 1e-5);
for (int i = 0; i < numParticles; i++)
ASSERT_EQUAL_VEC(state1.getForces()[i], state2.getForces()[i], 1e-5);
}
int main(int argc, char* argv[]) {
try {
testRBTorsions();
testParallelComputation();
}
catch(const exception& e) {
cout << "exception: " << e.what() << endl;
return 1;
}
cout << "Done" << endl;
return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment