Commit 222b3bb4 authored by peastman's avatar peastman
Browse files

Began CUDA version of GayBerneForce

parent a381a3ab
...@@ -1104,6 +1104,74 @@ private: ...@@ -1104,6 +1104,74 @@ private:
CUevent event; CUevent event;
}; };
/**
* This kernel is invoked by GayBerneForce to calculate the forces acting on the system.
*/
class CudaCalcGayBerneForceKernel : public CalcGayBerneForceKernel {
public:
CudaCalcGayBerneForceKernel(std::string name, const Platform& platform, CudaContext& cu) : CalcGayBerneForceKernel(name, platform), cu(cu),
hasInitializedKernels(false), sortedParticles(NULL), axisParticleIndices(NULL), sigParams(NULL), epsParams(NULL), scale(NULL), exceptionParticles(NULL),
exceptionParams(NULL), aMatrix(NULL),
bMatrix(NULL), gMatrix(NULL), exclusions(NULL), exclusionStartIndex(NULL), blockCenter(NULL), blockBoundingBox(NULL), neighbors(NULL),
neighborIndex(NULL), neighborBlockCount(NULL), sortedPos(NULL), torque(NULL) {
}
~CudaCalcGayBerneForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the GayBerneForce this kernel will be used for
*/
void initialize(const System& system, const GayBerneForce& force);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the GayBerneForce to copy the parameters from
*/
void copyParametersToContext(ContextImpl& context, const GayBerneForce& force);
private:
class ReorderListener;
void sortAtoms();
CudaContext& cu;
bool hasInitializedKernels;
int numRealParticles, numExceptions, maxNeighborBlocks;
GayBerneForce::NonbondedMethod nonbondedMethod;
CudaArray* sortedParticles;
CudaArray* axisParticleIndices;
CudaArray* sigParams;
CudaArray* epsParams;
CudaArray* scale;
CudaArray* exceptionParticles;
CudaArray* exceptionParams;
CudaArray* aMatrix;
CudaArray* bMatrix;
CudaArray* gMatrix;
CudaArray* exclusions;
CudaArray* exclusionStartIndex;
CudaArray* blockCenter;
CudaArray* blockBoundingBox;
CudaArray* neighbors;
CudaArray* neighborIndex;
CudaArray* neighborBlockCount;
CudaArray* sortedPos;
CudaArray* torque;
std::vector<bool> isRealParticle;
std::vector<std::pair<int, int> > exceptionAtoms;
std::vector<std::pair<int, int> > excludedPairs;
std::vector<void*> framesArgs, blockBoundsArgs, neighborsArgs, forceArgs, torqueArgs;
CUfunction framesKernel, blockBoundsKernel, neighborsKernel, forceKernel, torqueKernel;
CUevent event;
};
/** /**
* This kernel is invoked by VerletIntegrator to take one time step. * This kernel is invoked by VerletIntegrator to take one time step.
*/ */
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. * * Portions copyright (c) 2008-2016 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -110,6 +110,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform ...@@ -110,6 +110,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
return new CudaCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem()); return new CudaCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomManyParticleForceKernel::Name()) if (name == CalcCustomManyParticleForceKernel::Name())
return new CudaCalcCustomManyParticleForceKernel(name, platform, cu, context.getSystem()); return new CudaCalcCustomManyParticleForceKernel(name, platform, cu, context.getSystem());
if (name == CalcGayBerneForceKernel::Name())
return new CudaCalcGayBerneForceKernel(name, platform, cu);
if (name == IntegrateVerletStepKernel::Name()) if (name == IntegrateVerletStepKernel::Name())
return new CudaIntegrateVerletStepKernel(name, platform, cu); return new CudaIntegrateVerletStepKernel(name, platform, cu);
if (name == IntegrateLangevinStepKernel::Name()) if (name == IntegrateLangevinStepKernel::Name())
......
...@@ -5988,6 +5988,431 @@ void CudaCalcCustomManyParticleForceKernel::copyParametersToContext(ContextImpl& ...@@ -5988,6 +5988,431 @@ void CudaCalcCustomManyParticleForceKernel::copyParametersToContext(ContextImpl&
cu.invalidateMolecules(); cu.invalidateMolecules();
} }
class CudaGayBerneForceInfo : public CudaForceInfo {
public:
CudaGayBerneForceInfo(const GayBerneForce& force) : force(force) {
}
bool areParticlesIdentical(int particle1, int particle2) {
int xparticle1, yparticle1;
double sigma1, epsilon1, sx1, sy1, sz1, ex1, ey1, ez1;
int xparticle2, yparticle2;
double sigma2, epsilon2, sx2, sy2, sz2, ex2, ey2, ez2;
force.getParticleParameters(particle1, sigma1, epsilon1, xparticle1, yparticle1, sx1, sy1, sz1, ex1, ey1, ez1);
force.getParticleParameters(particle2, sigma2, epsilon2, xparticle2, yparticle2, sx2, sy2, sz2, ex2, ey2, ez2);
return (sigma1 == sigma2 && epsilon1 == epsilon2 && sx1 == sx2 && sy1 == sy2 && sz1 == sz2 && ex1 == ex2 && ey1 == ey2 && ez1 == ez2);
}
int getNumParticleGroups() {
return force.getNumExceptions()+force.getNumParticles();
}
void getParticlesInGroup(int index, vector<int>& particles) {
if (index < force.getNumExceptions()) {
int particle1, particle2;
double sigma, epsilon;
force.getExceptionParameters(index, particle1, particle2, sigma, epsilon);
particles.resize(2);
particles[0] = particle1;
particles[1] = particle2;
}
else {
int particle = index-force.getNumExceptions();
int xparticle, yparticle;
double sigma, epsilon, sx, sy, sz, ex, ey, ez;
force.getParticleParameters(particle, sigma, epsilon, xparticle, yparticle, sx, sy, sz, ex, ey, ez);
particles.clear();
particles.push_back(particle);
if (xparticle > -1)
particles.push_back(xparticle);
if (yparticle > -1)
particles.push_back(yparticle);
}
}
bool areGroupsIdentical(int group1, int group2) {
if (group1 < force.getNumExceptions() && group2 < force.getNumExceptions()) {
int particle1, particle2;
double sigma1, sigma2, epsilon1, epsilon2;
force.getExceptionParameters(group1, particle1, particle2, sigma1, epsilon1);
force.getExceptionParameters(group2, particle1, particle2, sigma2, epsilon2);
return (sigma1 == sigma2 && epsilon1 == epsilon2);
}
return true;
}
private:
const GayBerneForce& force;
};
class CudaCalcGayBerneForceKernel::ReorderListener : public CudaContext::ReorderListener {
public:
ReorderListener(CudaCalcGayBerneForceKernel& owner) : owner(owner) {
}
void execute() {
owner.sortAtoms();
}
private:
CudaCalcGayBerneForceKernel& owner;
};
CudaCalcGayBerneForceKernel::~CudaCalcGayBerneForceKernel() {
if (sortedParticles != NULL)
delete sortedParticles;
if (axisParticleIndices != NULL)
delete axisParticleIndices;
if (sigParams != NULL)
delete sigParams;
if (epsParams != NULL)
delete epsParams;
if (scale != NULL)
delete scale;
if (exceptionParticles != NULL)
delete exceptionParticles;
if (exceptionParams != NULL)
delete exceptionParams;
if (aMatrix != NULL)
delete aMatrix;
if (bMatrix != NULL)
delete bMatrix;
if (gMatrix != NULL)
delete gMatrix;
if (exclusions != NULL)
delete exclusions;
if (exclusionStartIndex != NULL)
delete exclusionStartIndex;
if (blockCenter != NULL)
delete blockCenter;
if (blockBoundingBox != NULL)
delete blockBoundingBox;
if (neighbors != NULL)
delete neighbors;
if (neighborIndex != NULL)
delete neighborIndex;
if (neighborBlockCount != NULL)
delete neighborBlockCount;
if (sortedPos != NULL)
delete sortedPos;
if (torque != NULL)
delete torque;
}
void CudaCalcGayBerneForceKernel::initialize(const System& system, const GayBerneForce& force) {
// Initialize interactions.
int numParticles = force.getNumParticles();
sigParams = CudaArray::create<float4>(cu, cu.getPaddedNumAtoms(), "sigParams");
epsParams = CudaArray::create<float2>(cu, cu.getPaddedNumAtoms(), "epsParams");
scale = CudaArray::create<float4>(cu, cu.getPaddedNumAtoms(), "scale");
axisParticleIndices = CudaArray::create<int2>(cu, cu.getPaddedNumAtoms(), "axisParticleIndices");
sortedParticles = CudaArray::create<int>(cu, cu.getPaddedNumAtoms(), "sortedParticles");
aMatrix = CudaArray::create<float>(cu, 9*cu.getPaddedNumAtoms(), "aMatrix");
bMatrix = CudaArray::create<float>(cu, 9*cu.getPaddedNumAtoms(), "bMatrix");
gMatrix = CudaArray::create<float>(cu, 9*cu.getPaddedNumAtoms(), "gMatrix");
vector<float4> sigParamsVector(cu.getPaddedNumAtoms(), make_float4(0, 0, 0, 0));
vector<float2> epsParamsVector(cu.getPaddedNumAtoms(), make_float2(0, 0));
vector<float4> scaleVector(cu.getPaddedNumAtoms(), make_float4(0, 0, 0, 0));
vector<int2> axisParticleVector(cu.getPaddedNumAtoms(), make_int2(0, 0));
isRealParticle.resize(cu.getPaddedNumAtoms());
for (int i = 0; i < numParticles; i++) {
int xparticle, yparticle;
double sigma, epsilon, sx, sy, sz, ex, ey, ez;
force.getParticleParameters(i, sigma, epsilon, xparticle, yparticle, sx, sy, sz, ex, ey, ez);
axisParticleVector[i] = make_int2(xparticle, yparticle);
sigParamsVector[i] = make_float4((float) (0.5*sigma), (float) (0.25*sx*sx), (float) (0.25*sy*sy), (float) (0.25*sz*sz));
epsParamsVector[i] = make_float2((float) sqrt(epsilon), (float) (0.125*(sx*sy + sz*sz)*sqrt(sx*sy)));
scaleVector[i] = make_float4((float) (1/sqrt(ex)), (float) (1/sqrt(ey)), (float) (1/sqrt(ez)), 0);
isRealParticle[i] = (epsilon != 0.0);
}
sigParams->upload(sigParamsVector);
epsParams->upload(epsParamsVector);
scale->upload(scaleVector);
axisParticleIndices->upload(axisParticleVector);
// Record exceptions and exclusions.
vector<float2> exceptionParamsVec;
for (int i = 0; i < force.getNumExceptions(); i++) {
int particle1, particle2;
double sigma, epsilon;
force.getExceptionParameters(i, particle1, particle2, sigma, epsilon);
if (epsilon != 0.0) {
exceptionParamsVec.push_back(make_float2((float) sigma, (float) epsilon));
exceptionAtoms.push_back(make_pair(particle1, particle2));
isRealParticle[particle1] = true;
isRealParticle[particle2] = true;
}
if (isRealParticle[particle1] && isRealParticle[particle2])
excludedPairs.push_back(pair<int, int>(particle1, particle2));
}
numRealParticles = 0;
for (int i = 0; i < isRealParticle.size(); i++)
if (isRealParticle[i])
numRealParticles++;
numExceptions = exceptionParamsVec.size();
exclusions = CudaArray::create<int>(cu, max(1, (int) excludedPairs.size()), "exclusions");
exclusionStartIndex = CudaArray::create<int>(cu, numRealParticles+1, "exclusionStartIndex");
exceptionParticles = CudaArray::create<int4>(cu, max(1, numExceptions), "exceptionParticles");
exceptionParams = CudaArray::create<float2>(cu, max(1, numExceptions), "exceptionParams");
if (numExceptions > 0)
exceptionParams->upload(exceptionParamsVec);
// Create data structures used for the neighbor list.
int numAtomBlocks = (numRealParticles+31)/32;
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
blockCenter = new CudaArray(cu, numAtomBlocks, 4*elementSize, "blockCenter");
blockBoundingBox = new CudaArray(cu, numAtomBlocks, 4*elementSize, "blockBoundingBox");
sortedPos = new CudaArray(cu, numRealParticles, 4*elementSize, "sortedPos");
maxNeighborBlocks = numRealParticles*2;
neighbors = CudaArray::create<int>(cu, maxNeighborBlocks*32, "neighbors");
neighborIndex = CudaArray::create<int>(cu, maxNeighborBlocks, "neighbors");
neighborBlockCount = CudaArray::create<int>(cu, 1, "neighborBlockCount");
if (force.getNonbondedMethod() != GayberneForce::NoCutoff)
CHECK_RESULT(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING), "Error creating event for CustomManyParticleForce");
// Create array for accumulating torques.
torque = CudaArray::create<long long>(cu, 3*cu.getPaddedNumAtoms(), "torque");
cu.addAutoclearBuffer(*torque);
// Create the kernels.
nonbondedMethod = force.getNonbondedMethod();
bool useCutoff = (nonbondedMethod != GayBerneForce::NoCutoff);
bool usePeriodic = (nonbondedMethod == GayBerneForce::CutoffPeriodic);
map<string, string> defines;
defines["USE_SWITCH"] = (useCutoff && force.getUseSwitchingFunction() ? "1" : "0");
double cutoff = force.getCutoffDistance();
defines["CUTOFF_SQUARED"] = cu.doubleToString(cutoff*cutoff);
if (useCutoff) {
defines["USE_CUTOFF"] = 1;
if (usePeriodic)
defines["USE_PERIODIC"] = "1";
// Compute the switching coefficients.
if (force.getUseSwitchingFunction()) {
defines["SWITCH_CUTOFF"] = cu.doubleToString(force.getSwitchingDistance());
defines["SWITCH_C3"] = cu.doubleToString(10/pow(force.getSwitchingDistance()-cutoff, 3.0));
defines["SWITCH_C4"] = cu.doubleToString(15/pow(force.getSwitchingDistance()-cutoff, 4.0));
defines["SWITCH_C5"] = cu.doubleToString(6/pow(force.getSwitchingDistance()-cutoff, 5.0));
}
}
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
CUmodule module = cu.createModule(CudakernelSources::vectorOps+CudaKernelSources::gayBerne, defines);
framesKernel = cu.getKernel(module, "computeEllipsoidFrames");
blockBoundsKernel = cu.getKernel(module, "findBlockBounds");
neighborsKernel = cu.getKernel(module, "findNeighbors");
forceKernel = cu.getKernel(module, "computeForce");
torqueKernel = cu.getKernel(module, "applyTorques");
cu.addForce(new CudaGayBerneForceInfo(force));
cu.addReorderListener(new ReorderListener(*this));
}
double CudaCalcGayBerneForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
if (!hasInitializedKernels) {
hasInitializedKernels = true;
sortAtoms();
framesArgs.push_back(&numRealParticles);
framesArgs.push_back(&cu.getPosq().getDevicePointer());
framesArgs.push_back(&axisParticleIndices->getDevicePointer());
framesArgs.push_back(&sigParams->getDevicePointer());
framesArgs.push_back(&scale->getDevicePointer());
framesArgs.push_back(&aMatrix->getDevicePointer());
framesArgs.push_back(&bMatrix->getDevicePointer());
framesArgs.push_back(&gMatrix->getDevicePointer());
framesArgs.push_back(&sortedParticles->getDevicePointer());
blockBoundsArgs.push_back(&numRealParticles);
blockBoundsArgs.push_back(cu.getPeriodicBoxSizePointer());
blockBoundsArgs.push_back(cu.getInvPeriodicBoxSizePointer());
blockBoundsArgs.push_back(cu.getPeriodicBoxVecXPointer());
blockBoundsArgs.push_back(cu.getPeriodicBoxVecYPointer());
blockBoundsArgs.push_back(cu.getPeriodicBoxVecZPointer());
blockBoundsArgs.push_back(&sortedParticles->getDevicePointer());
blockBoundsArgs.push_back(&cu.getPosq().getDevicePointer());
blockBoundsArgs.push_back(&sortedPos->getDevicePointer());
blockBoundsArgs.push_back(&blockCenter->getDevicePointer());
blockBoundsArgs.push_back(&blockBoundingBox->getDevicePointer());
blockBoundsArgs.push_back(&neighborBlockCount->getDevicePointer());
neighborsArgs.push_back(&numRealParticles);
neighborsArgs.push_back(&maxNeighborBlocks);
neighborsArgs.push_back(cu.getPeriodicBoxSizePointer());
neighborsArgs.push_back(cu.getInvPeriodicBoxSizePointer());
neighborsArgs.push_back(cu.getPeriodicBoxVecXPointer());
neighborsArgs.push_back(cu.getPeriodicBoxVecYPointer());
neighborsArgs.push_back(cu.getPeriodicBoxVecZPointer());
neighborsArgs.push_back(&sortedPos->getDevicePointer());
neighborsArgs.push_back(&blockCenter->getDevicePointer());
neighborsArgs.push_back(&blockBoundingBox->getDevicePointer());
neighborsArgs.push_back(&neighbors->getDevicePointer());
neighborsArgs.push_back(&neighborIndex->getDevicePointer());
neighborsArgs.push_back(&neighborBlockCount->getDevicePointer());
neighborsArgs.push_back(&exclusions->getDevicePointer());
neighborsArgs.push_back(&exclusionStartIndex->getDevicePointer());
forceArgs.push_back(&cu.getLongForceBuffer().getDevicePointer());
forceArgs.push_back(&torque->getDevicePointer());
forceArgs.push_back(&numRealParticles);
forceArgs.push_back(&numExceptions);
forceArgs.push_back(&cu.getEnergyBuffer().getDevicePointer());
forceArgs.push_back(&sortedPos->getDevicePointer());
forceArgs.push_back(&sigParams->getDevicePointer());
forceArgs.push_back(&epsParams->getDevicePointer());
forceArgs.push_back(&sortedParticles->getDevicePointer());
forceArgs.push_back(&aMatrix->getDevicePointer());
forceArgs.push_back(&bMatrix->getDevicePointer());
forceArgs.push_back(&gMatrix->getDevicePointer());
forceArgs.push_back(&exclusions->getDevicePointer());
forceArgs.push_back(&exclusionStartIndex->getDevicePointer());
forceArgs.push_back(&exceptionParticles->getDevicePointer());
forceArgs.push_back(&exceptionParams->getDevicePointer());
if (nonbondedMethod != GayBerneForce::NoCutoff) {
forceArgs.push_back(&maxNeighborBlocks);
forceArgs.push_back(&neighbors->getDevicePointer());
forceArgs.push_back(&neighborIndex->getDevicePointer());
forceArgs.push_back(&neighborBlockCount->getDevicePointer());
forceArgs.push_back(cu.getPeriodicBoxSizePointer());
forceArgs.push_back(cu.getInvPeriodicBoxSizePointer());
forceArgs.push_back(cu.getPeriodicBoxVecXPointer());
forceArgs.push_back(cu.getPeriodicBoxVecYPointer());
forceArgs.push_back(cu.getPeriodicBoxVecZPointer());
}
torqueArgs.push_back(&cu.getLongForceBuffer().getDevicePointer());
torqueArgs.push_back(&torque->getDevicePointer());
torqueArgs.push_back(&numRealParticles);
torqueArgs.push_back(&cu.getPosq().getDevicePointer());
torqueArgs.push_back(&axisParticleIndices->getDevicePointer());
torqueArgs.push_back(&sortedParticles->getDevicePointer());
}
cu.executeKernel(framesKernel, &framesArgs[0], numRealParticles);
cu.executeKernel(blockBoundsKernel, &blockBoundsArgs[0], (numRealParticles+31)/32);
if (nonbondedMethod == GayBerneForce::NoCutoff) {
cu.executeKernel(forceKernel, &forceArgs[0], cu.getNonbondedUtilities().getNumForceThreadBlocks()*cu.getNonbondedUtilities().getForceThreadBlockSize());
}
else {
while (true) {
cu.executeKernel(neighborsKernel, &neighborsArgs[0], numRealParticles);
int* count = (int*) cu.getPinnedBuffer();
neighborBlockCount->download(count, false);
cu.executeKernel(forceKernel, &forceArgs[0], cu.getNonbondedUtilities().getNumForceThreadBlocks()*cu.getNonbondedUtilities().getForceThreadBlockSize());
CHECK_RESULT(cuEventSynchronize(event), "Error synchronizing on event for GayBerneForce");
if (*count <= maxNeighborBlocks)
break;
// There wasn't enough room for the neighbor list, so we need to recreate it.
delete neighbors;
neighbors = NULL;
delete neighborIndex;
neighborIndex = NULL;
maxNeighborBlocks = (int) ceil((*count)*1.1);
neighbors = CudaArray::create<int>(cu, maxNeighborBlocks*32, "neighbors");
neighborIndex = CudaArray::create<int>(cu, maxNeighborBlocks, "neighbors");
neighborsArgs[10] = &neighbors->getDevicePointer();
neighborsArgs[11] = &neighborIndex->getDevicePointer();
forceArgs[17] = &neighbors->getDevicePointer();
forceArgs[18] = &neighborIndex->getDevicePointer();
}
}
cu.executeKernel(torqueKernel, &torqueArgs[0], numRealParticles);
return 0.0;
}
void CudaCalcGayBerneForceKernel::copyParametersToContext(ContextImpl& context, const GayBerneForce& force) {
// Make sure the new parameters are acceptable.
if (force.getNumParticles() != cu.getNumAtoms())
throw OpenMMException("updateParametersInContext: The number of particles has changed");
vector<int> exceptions;
for (int i = 0; i < force.getNumExceptions(); i++) {
int particle1, particle2;
double sigma, epsilon;
force.getExceptionParameters(i, particle1, particle2, sigma, epsilon);
if (exceptionAtoms.size() > exceptions.size() && make_pair(particle1, particle2) == exceptionAtoms[exceptions.size()])
exceptions.push_back(i);
else if (epsilon != 0.0)
throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
}
int numExceptions = exceptionAtoms.size();
// Record the per-particle parameters.
vector<float4> sigParamsVector(cu.getPaddedNumAtoms(), make_float4(0, 0, 0, 0));
vector<float2> epsParamsVector(cu.getPaddedNumAtoms(), make_float2(0, 0));
vector<float4> scaleVector(cu.getPaddedNumAtoms(), make_float4(0, 0, 0, 0));
for (int i = 0; i < force.getNumParticles(); i++) {
int xparticle, yparticle;
double sigma, epsilon, sx, sy, sz, ex, ey, ez;
force.getParticleParameters(i, sigma, epsilon, xparticle, yparticle, sx, sy, sz, ex, ey, ez);
sigParamsVector[i] = make_float4((float) (0.5*sigma), (float) (0.25*sx*sx), (float) (0.25*sy*sy), (float) (0.25*sz*sz));
epsParamsVector[i] = make_float2((float) sqrt(epsilon), (float) (0.125*(sx*sy + sz*sz)*sqrt(sx*sy)));
scaleVector[i] = make_float4((float) (1/sqrt(ex)), (float) (1/sqrt(ey)), (float) (1/sqrt(ez)), 0);
if (epsilon != 0.0 && !isRealParticle[i])
throw OpenMMException("updateParametersInContext: The set of ignored particles (ones with epsilon=0) has changed");
}
sigParams->upload(sigParamsVector);
epsParams->upload(epsParamsVector);
scale->upload(scaleVector);
// Record the exceptions.
if (numExceptions > 0) {
vector<float2> exceptionParamsVec(numExceptions);
for (int i = 0; i < numExceptions; i++) {
int atom1, atom2;
double sigma, epsilon;
force.getExceptionParameters(exceptions[i], atom1, atom2, sigma, epsilon);
exceptionParamsVec[i] = make_float2((float) sigma, (float) epsilon);
}
exceptionParams->upload(exceptionParamsVec);
}
cu.invalidateMolecules();
sortAtoms();
}
void CudaCalcGayBerneForceKernel::sortAtoms() {
// Sort the list of atoms by type to avoid thread divergence. This is executed every time
// the atoms are reordered.
int nextIndex = 0;
vector<int> particles(cu.getPaddedNumAtoms(), 0);
const vector<int>& order = cu.getAtomIndex();
vector<int> inverseOrder(order.size(), -1);
for (int i = 0; i < cu.getNumAtoms(); i++) {
int atom = order[i];
if (isRealParticle[atom]) {
inverseOrder[atom] = nextIndex;
particles[nextIndex++] = atom;
}
}
sortedParticles->upload(particles);
// Update the list of exception particles.
int numExceptions = exceptionAtoms.size();
if (numExceptions > 0) {
vector<int4> exceptionParticlesVec(numExceptions);
for (int i = 0; i < numExceptions; i++)
exceptionParticlesVec[i] = make_int4(exceptionAtoms[i].first, exceptionAtoms[i].second, inverseOrder[exceptionAtoms[i].first], inverseOrder[exceptionAtoms[i].second]);
exceptionParticles->upload(exceptionParticlesVec);
}
// Rebuild the list of exclusions.
vector<vector<int> > excludedAtoms(numRealParticles);
for (int i = 0; i < excludedPairs.size(); i++) {
int first = inverseOrder[min(excludedPairs[i].first, excludedPairs[i].second)];
int second = inverseOrder[max(excludedPairs[i].first, excludedPairs[i].second)];
excludedAtoms[first].push_back(second);
}
int index = 0;
vector<int> exclusionVec(exclusions->getSize());
vector<int> startIndexVec(exclusionStartIndex->getSize());
for (int i = 0; i < numRealParticles; i++) {
startIndexVec[i] = index;
for (int j = 0; j < excludedAtoms[i].size(); j++)
exclusionVec[index++] = excludedAtoms[i][j];
}
startIndexVec[numRealParticles] = index;
exclusions->upload(exclusionVec);
exclusionStartIndex->upload(startIndexVec);
}
CudaIntegrateVerletStepKernel::~CudaIntegrateVerletStepKernel() { CudaIntegrateVerletStepKernel::~CudaIntegrateVerletStepKernel() {
} }
......
...@@ -92,6 +92,7 @@ CudaPlatform::CudaPlatform() { ...@@ -92,6 +92,7 @@ CudaPlatform::CudaPlatform() {
registerKernelFactory(CalcCustomCentroidBondForceKernel::Name(), factory); registerKernelFactory(CalcCustomCentroidBondForceKernel::Name(), factory);
registerKernelFactory(CalcCustomCompoundBondForceKernel::Name(), factory); registerKernelFactory(CalcCustomCompoundBondForceKernel::Name(), factory);
registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory); registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory);
registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
registerKernelFactory(IntegrateVerletStepKernel::Name(), factory); registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory); registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory); registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
......
#define TILE_SIZE 32
#define NEIGHBOR_BLOCK_SIZE 32
/**
* Calculate the ellipsoid coordinate frames and associated matrices.
*/
extern "C" __global__ void computeEllipsoidFrames(int numParticles, const real4* __restrict__ posq, int2* const __restrict__ axisParticleIndices,
const float4* __restrict__ sigParams, const float4* __restrict__ scale, real* __restrict__ aMatrix,
real* __restrict__ bMatrix, real* __restrict__ gMatrix, const int* sortedParticles) {
for (int sortedIndex = blockIdx.x*blockDim.x+threadIdx.x; sortedIndex < numParticles; sortedIndex += blockDim.x*gridDim.x) {
// Compute the local coordinate system of the ellipsoid;
int originalIndex = sortedParticles[sortedIndex];
real3 pos = trimTo3(posq[originalIndex]);
int2 axisParticles = axisParticleIndices[originalIndex];
real3 xdir, ydir, zdir;
if (axisParticles.x == -1) {
xdir = make_real3(1, 0, 0);
ydir = make_real3(0, 1, 0);
}
else {
xdir = pos-trimTo3(posq[axisParticles.x]);
xdir = normalize(xdir);
if (axisParticles.y == -1) {
if (xdir.y > -0.5f && xdir.y < 0.5f)
ydir = make_real3(0, 1, 0);
else
ydir = make_real3(1, 0, 0);
}
else
ydir = pos-trimTo3(posq[axisParticles.y]);
ydir -= xdir*dot(xdir, ydir);
ydir = normalize(ydir);
}
zdir = cross(xdir, ydir);
// Compute matrices we will need later.
real (*a)[3] = (real (*)[3]) (aMatrix+sortedIndex*9);
real (*b)[3] = (real (*)[3]) (bMatrix+sortedIndex*9);
real (*g)[3] = (real (*)[3]) (gMatrix+sortedIndex*9);
a[0][0] = xdir.x;
a[0][1] = xdir.y;
a[0][2] = xdir.z;
a[1][0] = ydir.x;
a[1][1] = ydir.y;
a[1][2] = ydir.z;
a[2][0] = zdir.x;
a[2][1] = zdir.y;
a[2][2] = zdir.z;
float4 sig = sigParams[originalIndex];
float3 r2 = sig.yzw;
float3 e2 = trimTo3(scale[originalIndex]);
for (int i = 0; i < 3; i++)
for (int j = 0; j < 3; j++) {
b[i][j] = a[0][i]*e2.x*a[0][j] + a[1][i]*e2.y*a[1][j] + a[2][i]*e2.z*a[2][j];
g[i][j] = a[0][i]*r2.x*a[0][j] + a[1][i]*r2.y*a[1][j] + a[2][i]*r2.z*a[2][j];
}
}
}
/**
* Find a bounding box for the atoms in each block.
*/
extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
const int* sortedAtoms, const real4* __restrict__ posq, real4* __restrict__ sortedPos, real4* __restrict__ blockCenter,
real4* __restrict__ blockBoundingBox, int* __restrict__ neighborBlockCount) {
int index = blockIdx.x*blockDim.x+threadIdx.x;
int base = index*TILE_SIZE;
while (base < numAtoms) {
real4 pos = posq[sortedAtoms[base]];
sortedPos[base] = pos;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_POS(pos)
#endif
real4 minPos = pos;
real4 maxPos = pos;
int last = min(base+TILE_SIZE, numAtoms);
for (int i = base+1; i < last; i++) {
pos = posq[sortedAtoms[i]];
sortedPos[i] = pos;
#ifdef USE_PERIODIC
real4 center = 0.5f*(maxPos+minPos);
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
#endif
minPos = min(minPos, pos);
maxPos = max(maxPos, pos);
}
real4 blockSize = 0.5f*(maxPos-minPos);
blockBoundingBox[index] = blockSize;
blockCenter[index] = 0.5f*(maxPos+minPos);
index += blockDim.x*gridDim.x;
base = index*TILE_SIZE;
}
if (blockIdx.x*blockDim.x+threadIdx.x == 0)
*neighborBlockCount = 0;
}
/**
* This is called by findNeighbors() to write a block to the neighbor list.
*/
void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuffer, int maxNeighborBlocks, int* __restrict__ neighbors,
int* __restrict__ neighborIndex, int* __restrict__ neighborBlockCount) {
int blockIndex = atomicAdd(neighborBlockCount, 1);
if (blockIndex >= maxNeighborBlocks)
return; // We don't have enough room for the neighbor list.
neighborIndex[blockIndex] = atom1;
int baseIndex = blockIndex*NEIGHBOR_BLOCK_SIZE;
for (int i = 0; i < numAtomsInBuffer; i++)
neighbors[baseIndex+i] = neighborBuffer[i];
for (int i = numAtomsInBuffer; i < NEIGHBOR_BLOCK_SIZE; i++)
neighbors[baseIndex+i] = -1;
}
/**
* Build a list of neighbors for each atom.
*/
extern "C" __global__ void findNeighbors(int numAtoms, int maxNeighborBlocks, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real4* __restrict__ sortedPos, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ neighbors,
int* __restrict__ neighborIndex, int* __restrict__ neighborBlockCount, const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex) {
const int numBlocks = (numAtoms+TILE_SIZE-1)/TILE_SIZE;
int neighborBuffer[NEIGHBOR_BLOCK_SIZE];
for (int atom1 = blockIdx.x*blockDim.x+threadIdx.x; atom1 < numAtoms; atom1 += blockDim.x*gridDim.x) {
int nextExclusion = exclusionStartIndex[atom1];
int lastExclusion = exclusionStartIndex[atom1+1];
real4 pos = sortedPos[atom1];
int nextBufferIndex = 0;
// Loop over atom blocks and compute the distance of this atom from each one's bounding box.
for (int block = (atom1+1)/TILE_SIZE; block < numBlocks; block++) {
real4 center = blockCenter[block];
real4 blockSize = blockBoundingBox[block];
real4 blockDelta = center-pos;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(blockDelta)
#endif
blockDelta.x = max((real) 0, fabs(blockDelta.x)-blockSize.x);
blockDelta.y = max((real) 0, fabs(blockDelta.y)-blockSize.y);
blockDelta.z = max((real) 0, fabs(blockDelta.z)-blockSize.z);
if (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z >= CUTOFF_SQUARED)
continue;
// Loop over atoms within this block.
int first = max(block*TILE_SIZE, atom1+1);
int last = min((block+1)*TILE_SIZE, numAtoms);
for (int atom2 = first; atom2 < last; atom2++) {
// Skip over excluded interactions.
if (nextExclusion < lastExclusion && exclusions[nextExclusion] >= atom2) {
nextExclusion++;
continue;
}
real4 delta = pos-sortedPos[atom2];
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) {
neighborBuffer[nextBufferIndex++] = atom2;
if (nextBufferIndex == NEIGHBOR_BLOCK_SIZE) {
storeNeighbors(atom1, neighborBuffer, nextBufferIndex, maxNeighborBlocks, neighbors, neighborIndex, neighborBlockCount);
nextBufferIndex = 0;
}
}
}
}
if (nextBufferIndex > 0)
storeNeighbors(atom1, neighborBuffer, nextBufferIndex, maxNeighborBlocks, neighbors, neighborIndex, neighborBlockCount);
}
}
typedef struct {
float4 sig;
float2 eps;
real3 pos;
real a[3][3], b[3][3], g[3][3];
} AtomData;
void loadAtomData(AtomData* data, int sortedIndex, int originalIndex, const real4* __restrict__ pos, const float4* __restrict__ sigParams,
const float2* __restrict__ epsParams, const real* __restrict__ aMatrix, const real* __restrict__ bMatrix, const real* __restrict__ gMatrix) {
data->sig = sigParams[originalIndex];
data->eps = epsParams[originalIndex];
data->pos = trimTo3(pos[sortedIndex]);
for (int i = 0; i < 3; i++)
for (int j = 0; j < 3; j++) {
int k = 9*sortedIndex+3*i+j;
data->a[i][j] = aMatrix[k];
data->b[i][j] = bMatrix[k];
data->g[i][j] = gMatrix[k];
}
}
real3 matrixVectorProduct(real (*m)[3], real3 v) {
return make_real3(m[0][0]*v.x + m[0][1]*v.y + m[0][2]*v.z,
m[1][0]*v.x + m[1][1]*v.y + m[1][2]*v.z,
m[2][0]*v.x + m[2][1]*v.y + m[2][2]*v.z);
}
real3 vectorMatrixProduct(real3 v, real (*m)[3]) {
return make_real3(m[0][0]*v.x + m[1][0]*v.y + m[2][0]*v.z,
m[0][1]*v.x + m[1][1]*v.y + m[2][1]*v.z,
m[0][2]*v.x + m[1][2]*v.y + m[2][2]*v.z);
}
void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3]) {
result[0][0] = a[0][0]+b[0][0];
result[0][1] = a[0][1]+b[0][1];
result[0][2] = a[0][2]+b[0][2];
result[1][0] = a[1][0]+b[1][0];
result[1][1] = a[1][1]+b[1][1];
result[1][2] = a[1][2]+b[1][2];
result[2][0] = a[2][0]+b[2][0];
result[2][1] = a[2][1]+b[2][1];
result[2][2] = a[2][2]+b[2][2];
}
real determinant(real (*m)[3]) {
return (m[0][0]*m[1][1]*m[2][2] + m[0][1]*m[1][2]*m[2][0] + m[0][2]*m[1][0]*m[2][1] -
m[0][0]*m[1][2]*m[2][1] - m[0][1]*m[1][0]*m[2][2] - m[0][2]*m[1][1]*m[2][0]);
}
void matrixInverse(real (*result)[3], real (*m)[3]) {
real invDet = RECIP(determinant(m));
result[0][0] = invDet*(m[1][1]*m[2][2] - m[1][2]*m[2][1]);
result[1][0] = -invDet*(m[1][0]*m[2][2] - m[1][2]*m[2][0]);
result[2][0] = invDet*(m[1][0]*m[2][1] - m[1][1]*m[2][0]);
result[0][1] = -invDet*(m[0][1]*m[2][2] - m[0][2]*m[2][1]);
result[1][1] = invDet*(m[0][0]*m[2][2] - m[0][2]*m[2][0]);
result[2][1] = -invDet*(m[0][0]*m[2][1] - m[0][1]*m[2][0]);
result[0][2] = invDet*(m[0][1]*m[1][2] - m[0][2]*m[1][1]);
result[1][2] = -invDet*(m[0][0]*m[1][2] - m[0][2]*m[1][0]);
result[2][2] = invDet*(m[0][0]*m[1][1] - m[0][1]*m[1][0]);
}
void computeOneInteraction(AtomData* data1, AtomData* data2, real sigma, real epsilon, real3 dr, real r2, real3* force1, real3* force2, real3* torque1, real3* torque2, real *totalEnergy) {
real rInv = RSQRT(r2);
real r = r2*rInv;
real3 drUnit = dr*rInv;
// Compute the switching function.
real switchValue = 1, switchDeriv = 0;
#if USE_SWITCH
if (r > SWITCH_CUTOFF) {
real x = r-SWITCH_CUTOFF;
switchValue = 1+x*x*x*(SWITCH_C3+x*(SWITCH_C4+x*SWITCH_C5));
switchDeriv = x*x*(3*SWITCH_C3+x*(4*SWITCH_C4+x*5*SWITCH_C5));
}
#endif
// Compute vectors and matrices we'll be needing.
real B12[3][3], G12[3][3], B12inv[3][3], G12inv[3][3];
matrixSum(B12, data1->b, data2->b);
matrixSum(G12, data1->g, data2->g);
matrixInverse(B12inv, B12);
matrixInverse(G12inv, G12);
real detG12 = determinant(G12);
// Estimate the distance between the ellipsoids and compute the first terms needed for the energy.
real sigma12 = 1/SQRT(0.5f*dot(drUnit, matrixVectorProduct(G12inv, drUnit)));
real h12 = r - sigma12;
real rho = sigma/(h12+sigma);
real rho2 = rho*rho;
real rho6 = rho2*rho2*rho2;
real u = 4*epsilon*(rho6*rho6-rho6);
real eta = SQRT(2*data1->eps.y*data2->eps.y/detG12);
real chi = 2*dot(drUnit, matrixVectorProduct(B12inv, drUnit));
chi *= chi;
real energy = u*eta*chi;
// Compute the terms needed for the force.
real3 kappa = matrixVectorProduct(G12inv, dr);
real3 iota = matrixVectorProduct(B12inv, dr);
real rInv2 = rInv*rInv;
real dUSLJdr = 24*epsilon*(2*rho6-1)*rho6*rho/sigma;
real temp = 0.5f*sigma12*sigma12*sigma12*rInv2;
real3 dudr = (drUnit + (kappa-drUnit*dot(kappa, drUnit))*temp)*dUSLJdr;
real3 dchidr = (iota-drUnit*dot(iota, drUnit))*(-8*rInv2*SQRT(chi));
real3 force = (dchidr*u + dudr*chi)*(eta*switchValue) - drUnit*(energy*switchDeriv);
*force1 += force;
*force2 -= force;
// Compute the terms needed for the torque.
for (int j = 0; j < 2; j++) {
real (*a)[3] = (j == 0 ? data1->a : data2->a);
real (*b)[3] = (j == 0 ? data1->b : data2->b);
real (*g)[3] = (j == 0 ? data1->g : data2->g);
float4 sig = (j == 0 ? data1->sig : data2->sig);
real3 dudq = cross(vectorMatrixProduct(kappa, g), kappa*(temp*dUSLJdr));
real3 dchidq = cross(vectorMatrixProduct(iota, b), iota)*(-4*rInv2);
real3 scale = make_real3(sig.y, sig.z, sig.w)*(-0.5f*eta/detG12);
real d[3][3];
d[0][0] = scale.x*(2*a[0][0]*(G12[1][1]*G12[2][2] - G12[1][2]*G12[2][1]) +
a[0][2]*(G12[1][2]*G12[0][1] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])) +
a[0][1]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])));
d[0][1] = scale.x*( a[0][0]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])) +
2*a[0][1]*(G12[0][0]*G12[2][2] - G12[2][0]*G12[0][2]) +
a[0][2]*(G12[1][0]*G12[0][2] + G12[2][0]*G12[0][1] - G12[0][0]*(G12[1][2] + G12[2][1])));
d[0][2] = scale.x*( a[0][0]*(G12[0][1]*G12[1][2] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])) +
a[0][1]*(G12[1][0]*G12[0][2] + G12[2][0]*G12[0][1] - G12[0][0]*(G12[1][2] + G12[2][1])) +
2*a[0][2]*(G12[1][1]*G12[0][0] - G12[1][0]*G12[0][1]));
d[1][0] = scale.y*(2*a[1][0]*(G12[1][1]*G12[2][2] - G12[1][2]*G12[2][1]) +
a[1][1]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])) +
a[1][2]*(G12[1][2]*G12[0][1] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])));
d[1][1] = scale.y*( a[1][0]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])) +
2*a[1][1]*(G12[2][2]*G12[0][0] - G12[2][0]*G12[0][2]) +
a[1][2]*(G12[1][0]*G12[0][2] + G12[0][1]*G12[2][0] - G12[0][0]*(G12[1][2] + G12[2][1])));
d[1][2] = scale.y*( a[1][0]*(G12[0][1]*G12[1][2] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])) +
a[1][1]*(G12[1][0]*G12[0][2] + G12[0][1]*G12[2][0] - G12[0][0]*(G12[1][2] + G12[2][1])) +
2*a[1][2]*(G12[1][1]*G12[0][0] - G12[1][0]*G12[0][1]));
d[2][0] = scale.z*(2*a[2][0]*(G12[1][1]*G12[2][2] - G12[2][1]*G12[1][2]) +
a[2][1]*(G12[0][2]*G12[2][1] + G12[1][2]*G12[2][0] - G12[2][2]*(G12[0][1] + G12[1][0])) +
a[2][2]*(G12[0][1]*G12[1][2] + G12[2][1]*G12[1][0] - G12[1][1]*(G12[0][2] + G12[2][0])));
d[2][1] = scale.z*( a[2][0]*(G12[0][2]*G12[2][1] + G12[1][2]*G12[2][0] - G12[2][2]*(G12[0][1] + G12[1][0])) +
2*a[2][1]*(G12[0][0]*G12[2][2] - G12[0][2]*G12[2][0]) +
a[2][2]*(G12[1][0]*G12[0][2] + G12[0][1]*G12[2][0] - G12[0][0]*(G12[1][2] + G12[2][1])));
d[2][2] = scale.z*( a[2][0]*(G12[0][1]*G12[1][2] + G12[2][1]*G12[1][0] - G12[1][1]*(G12[0][2] + G12[2][0])) +
a[2][1]*(G12[1][0]*G12[0][2] + G12[2][0]*G12[0][1] - G12[0][0]*(G12[1][2] + G12[2][1])) +
2*a[2][2]*(G12[1][1]*G12[0][0] - G12[1][0]*G12[0][1]));
real3 detadq = 0;
for (int i = 0; i < 3; i++)
detadq += cross(make_real3(a[i][0], a[i][1], a[i][2]), make_real3(d[i][0], d[i][1], d[i][2]));
real3 torque = (dchidq*(u*eta) + detadq*(u*chi) + dudq*(eta*chi))*switchValue;
*(j == 0 ? torque1 : torque2) -= torque;
}
*totalEnergy += switchValue*energy;
}
/**
* Compute the interactions.
*/
extern "C" __global__ void computeForce(
long* __restrict__ forceBuffers, long* __restrict__ torqueBuffers,
int numAtoms, int numExceptions, mixed* __restrict__ energyBuffer, const real4* __restrict__ pos,
const float4* __restrict__ sigParams, const float2* __restrict__ epsParams, const int* __restrict__ sortedAtoms,
const real* __restrict__ aMatrix, const real* __restrict__ bMatrix, const real* __restrict__ gMatrix,
const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex,
const int4* __restrict__ exceptionParticles, const float2* __restrict__ exceptionParams
#ifdef USE_CUTOFF
, int maxNeighborBlocks, int* __restrict__ neighbors, int* __restrict__ neighborIndex, int* __restrict__ neighborBlockCount,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
#endif
) {
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
mixed energy = 0;
#ifdef USE_CUTOFF
const int numBlocks = *neighborBlockCount;
if (numBlocks > maxNeighborBlocks)
return; // There wasn't enough memory for the neighbor list.
for (int block = blockIdx.x*blockDim.x+threadIdx.x; block < numBlocks; block += blockDim.x*gridDim.x) {
// Load parameters for atom1.
int atom1 = neighborIndex[block];
int index1 = sortedAtoms[atom1];
AtomData data1;
loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
real3 force1 = 0.0f;
real3 torque1 = 0.0f;
for (int indexInBlock = 0; indexInBlock < NEIGHBOR_BLOCK_SIZE; indexInBlock++) {
// Load parameters for atom2.
int atom2 = neighbors[NEIGHBOR_BLOCK_SIZE*block+indexInBlock];
if (atom2 == -1)
continue;
int index2 = sortedAtoms[atom2];
AtomData data2;
loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
real3 force2 = 0.0f;
real3 torque2 = 0.0f;
// Compute the interaction.
real3 delta = data1.pos-data2.pos;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real sigma = data1.sig.x+data2.sig.x;
real epsilon = data1.eps.x*data2.eps.x;
computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
atomicAdd(&forceBuffers[index2], static_cast<unsigned long long>((long long) (force2.x*0x100000000)));
atomicAdd(&forceBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.y*0x100000000)));
atomicAdd(&forceBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.z*0x100000000)));
atomicAdd(&torqueBuffers[index2], static_cast<unsigned long long>((long long) (torque2.x*0x100000000)));
atomicAdd(&torqueBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.y*0x100000000)));
atomicAdd(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.z*0x100000000)));
}
atomicAdd(&forceBuffers[index1], static_cast<unsigned long long>((long long) (force1.x*0x100000000)));
atomicAdd(&forceBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.y*0x100000000)));
atomicAdd(&forceBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.z*0x100000000)));
atomicAdd(&torqueBuffers[index1], static_cast<unsigned long long>((long long) (torque1.x*0x100000000)));
atomicAdd(&torqueBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.y*0x100000000)));
atomicAdd(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.z*0x100000000)));
}
#else
for (int atom1 = blockIdx.x*blockDim.x+threadIdx.x; atom1 < numAtoms; atom1 += blockDim.x*gridDim.x) {
// Load parameters for atom1.
int index1 = sortedAtoms[atom1];
AtomData data1;
loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
real3 force1 = 0.0f;
real3 torque1 = 0.0f;
int nextExclusion = exclusionStartIndex[atom1];
int lastExclusion = exclusionStartIndex[atom1+1];
for (int atom2 = atom1+1; atom2 < numAtoms; atom2++) {
// Skip over excluded interactions.
if (nextExclusion < lastExclusion && exclusions[nextExclusion] == atom2) {
nextExclusion++;
continue;
}
// Load parameters for atom2.
int index2 = sortedAtoms[atom2];
AtomData data2;
loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
real3 force2 = 0.0f;
real3 torque2 = 0.0f;
// Compute the interaction.
real3 delta = data1.pos-data2.pos;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real sigma = data1.sig.x+data2.sig.x;
real epsilon = data1.eps.x*data2.eps.x;
computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
atomicAdd(&forceBuffers[index2], static_cast<unsigned long long>((long long) (force2.x*0x100000000)));
atomicAdd(&forceBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.y*0x100000000)));
atomicAdd(&forceBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.z*0x100000000)));
atomicAdd(&torqueBuffers[index2], static_cast<unsigned long long>((long long) (torque2.x*0x100000000)));
atomicAdd(&torqueBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.y*0x100000000)));
atomicAdd(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.z*0x100000000)));
}
atomicAdd(&forceBuffers[index1], static_cast<unsigned long long>((long long) (force1.x*0x100000000)));
atomicAdd(&forceBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.y*0x100000000)));
atomicAdd(&forceBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.z*0x100000000)));
atomicAdd(&torqueBuffers[index1], static_cast<unsigned long long>((long long) (torque1.x*0x100000000)));
atomicAdd(&torqueBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.y*0x100000000)));
atomicAdd(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.z*0x100000000)));
}
#endif
// Now compute exceptions.
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numExceptions; index += blockDim.x*gridDim.x) {
int4 atomIndices = exceptionParticles[index];
float2 params = exceptionParams[index];
int index1 = atomIndices.x, index2 = atomIndices.y;
int atom1 = atomIndices.z, atom2 = atomIndices.w;
AtomData data1, data2;
loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
real3 force1 = 0, force2 = 0;
real3 torque1 = 0, torque2 = 0;
real3 delta = data1.pos-data2.pos;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
computeOneInteraction(&data1, &data2, params.x, params.y, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
atomicAdd(&forceBuffers[index1], static_cast<unsigned long long>((long long) (force1.x*0x100000000)));
atomicAdd(&forceBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.y*0x100000000)));
atomicAdd(&forceBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.z*0x100000000)));
atomicAdd(&forceBuffers[index2], static_cast<unsigned long long>((long long) (force2.x*0x100000000)));
atomicAdd(&forceBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.y*0x100000000)));
atomicAdd(&forceBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.z*0x100000000)));
atomicAdd(&torqueBuffers[index1], static_cast<unsigned long long>((long long) (torque1.x*0x100000000)));
atomicAdd(&torqueBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.y*0x100000000)));
atomicAdd(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.z*0x100000000)));
atomicAdd(&torqueBuffers[index2], static_cast<unsigned long long>((long long) (torque2.x*0x100000000)));
atomicAdd(&torqueBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.y*0x100000000)));
atomicAdd(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.z*0x100000000)));
#ifdef USE_CUTOFF
}
#endif
}
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
/**
* Convert the torques to forces on the connected particles.
*/
extern "C" __global__ void applyTorques(
long* __restrict__ forceBuffers, long* __restrict__ torqueBuffers,
int numParticles, const real4* __restrict__ posq, int2* const __restrict__ axisParticleIndices,
const int* sortedParticles) {
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
for (int sortedIndex = blockIdx.x*blockDim.x+threadIdx.x; sortedIndex < numParticles; sortedIndex += blockDim.x*gridDim.x) {
int originalIndex = sortedParticles[sortedIndex];
real3 pos = trimTo3(posq[originalIndex]);
int2 axisParticles = axisParticleIndices[originalIndex];
if (axisParticles.x != -1) {
// Load the torque.
real scale = 1/(real) 0x100000000;
real3 torque = make_real3(scale*torqueBuffers[originalIndex], scale*torqueBuffers[originalIndex+PADDED_NUM_ATOMS], scale*torqueBuffers[originalIndex+2*PADDED_NUM_ATOMS]);
real3 force = 0, xforce = 0, yforce = 0;
// Apply a force to the x particle.
real3 dx = trimTo3(posq[axisParticles.x])-pos;
real dx2 = dot(dx, dx);
real3 f = cross(torque, dx)/dx2;
xforce += f;
force -= f;
if (axisParticles.y != -1) {
// Apply a force to the y particle. This is based on the component of the torque
// that was not already applied to the x particle.
real3 dy = trimTo3(posq[axisParticles.y])-pos;
real dy2 = dot(dy, dy);
real3 torque2 = dx*dot(torque, dx)/dx2;
f = cross(torque2, dy)/dy2;
yforce += f;
force -= f;
}
atomicAdd(&forceBuffers[originalIndex], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[originalIndex+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[originalIndex+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
atomicAdd(&forceBuffers[axisParticles.x], static_cast<unsigned long long>((long long) (xforce.x*0x100000000)));
atomicAdd(&forceBuffers[axisParticles.x+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (xforce.y*0x100000000)));
atomicAdd(&forceBuffers[axisParticles.x+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (xforce.z*0x100000000)));
if (axisParticles.y != -1) {
atomicAdd(&forceBuffers[axisParticles.y], static_cast<unsigned long long>((long long) (yforce.x*0x100000000)));
atomicAdd(&forceBuffers[axisParticles.y+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (yforce.y*0x100000000)));
atomicAdd(&forceBuffers[axisParticles.y+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (yforce.z*0x100000000)));
}
}
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2016 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CudaTests.h"
#include "TestGayBerneForce.h"
void runPlatformTests() {
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment