Unverified commit b4c54303, authored by Peter Eastman and committed by GitHub
Browse files

Faster implementation of CustomHbondForce (#4060)

* Faster implementation of CustomHbondForce

* Minor optimization

* Optimized writing forces, which are often zero

* Fix test failure on CPU OpenCL

* Bug fix
parent 2ae50f9d
...@@ -793,7 +793,7 @@ private: ...@@ -793,7 +793,7 @@ private:
std::vector<ComputeArray> tabulatedFunctionArrays; std::vector<ComputeArray> tabulatedFunctionArrays;
std::map<std::string, int> tabulatedFunctionUpdateCount; std::map<std::string, int> tabulatedFunctionUpdateCount;
const System& system; const System& system;
ComputeKernel donorKernel, acceptorKernel; ComputeKernel kernel;
}; };
/** /**
......
This diff is collapsed.
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* Compute the difference between two vectors, optionally taking periodic boundary conditions into account * Compute the difference between two vectors, optionally taking periodic boundary conditions into account
* and setting the fourth component to the squared magnitude. * and setting the fourth component to the squared magnitude.
*/ */
inline DEVICE real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) { inline DEVICE real4 delta(real3 vec1, real3 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0); real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(result) APPLY_PERIODIC_TO_DELTA(result)
...@@ -41,184 +41,116 @@ inline DEVICE real4 computeCross(real4 vec1, real4 vec2) { ...@@ -41,184 +41,116 @@ inline DEVICE real4 computeCross(real4 vec1, real4 vec2) {
} }
/** /**
* Compute forces on donors. * Write the force on an atom to global memory.
*/ */
KERNEL void computeDonorForces( inline DEVICE void applyForce(int atom, real3 f, GLOBAL mm_ulong* force) {
if (atom > -1) {
if (f.x != 0)
ATOMIC_ADD(&force[atom], (mm_ulong) realToFixedPoint(f.x));
if (f.y != 0)
ATOMIC_ADD(&force[atom+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f.y));
if (f.z != 0)
ATOMIC_ADD(&force[atom+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f.z));
MEM_FENCE;
}
}
/**
 * Per-acceptor record kept in the LOCAL localData array of computeHbondForces.
 * The kernel fills pos1..pos3 from posq for the acceptor's three atoms (zero
 * when an atom index is -1), zeroes f1..f3 before each tile, and passes f1..f3
 * to applyForce once the tile is done.
 */
typedef struct {
    // Positions of the acceptor's first, second and third atoms.
    real3 pos1;
    real3 pos2;
    real3 pos3;
    // Forces on those same atoms; presumably accumulated by the COMPUTE_FORCE
    // expansion, which is not visible here.
    real3 f1;
    real3 f2;
    real3 f3;
} AcceptorData;
/**
* Compute forces on donors and acceptors.
*/
KERNEL void computeHbondForces(
GLOBAL mm_ulong* RESTRICT force, GLOBAL mm_ulong* RESTRICT force,
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT exclusions, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT exclusions,
GLOBAL const int4* RESTRICT donorAtoms, GLOBAL const int4* RESTRICT acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, GLOBAL const int4* RESTRICT donorAtoms, GLOBAL const int4* RESTRICT acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
LOCAL real4 posBuffer[3*THREAD_BLOCK_SIZE]; const unsigned int totalWarps = GLOBAL_SIZE/32;
const unsigned int warp = GLOBAL_ID/32;
const int indexInWarp = GLOBAL_ID%32;
const int tbx = LOCAL_ID-indexInWarp;
LOCAL AcceptorData localData[THREAD_BLOCK_SIZE];
mixed energy = 0; mixed energy = 0;
for (int tile = warp; tile < NUM_DONOR_BLOCKS*NUM_ACCEPTOR_BLOCKS; tile += totalWarps) {
int donorStart = (tile/NUM_ACCEPTOR_BLOCKS)*32;
int acceptorStart = (tile%NUM_ACCEPTOR_BLOCKS)*32;
// Load information about the donor this thread will compute forces on.
real3 f1 = make_real3(0); real3 f1 = make_real3(0);
real3 f2 = make_real3(0); real3 f2 = make_real3(0);
real3 f3 = make_real3(0); real3 f3 = make_real3(0);
for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += GLOBAL_SIZE) { int donorIndex = donorStart+indexInWarp;
// Load information about the donor this thread will compute forces on.
int donorIndex = donorStart+GLOBAL_ID;
int4 atoms, exclusionIndices; int4 atoms, exclusionIndices;
real4 d1, d2, d3; real3 d1, d2, d3;
if (donorIndex < NUM_DONORS) { if (donorIndex < NUM_DONORS) {
atoms = donorAtoms[donorIndex]; atoms = donorAtoms[donorIndex];
d1 = (atoms.x > -1 ? posq[atoms.x] : make_real4(0)); d1 = (atoms.x > -1 ? trimTo3(posq[atoms.x]) : make_real3(0));
d2 = (atoms.y > -1 ? posq[atoms.y] : make_real4(0)); d2 = (atoms.y > -1 ? trimTo3(posq[atoms.y]) : make_real3(0));
d3 = (atoms.z > -1 ? posq[atoms.z] : make_real4(0)); d3 = (atoms.z > -1 ? trimTo3(posq[atoms.z]) : make_real3(0));
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
exclusionIndices = exclusions[donorIndex]; exclusionIndices = exclusions[donorIndex];
#endif #endif
} }
else else
atoms = make_int4(-1, -1, -1, -1); atoms = make_int4(-1, -1, -1, -1);
for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += LOCAL_SIZE) {
// Load the next block of acceptors into local memory.
SYNC_THREADS; // Load information about the acceptors into local memory.
int blockSize = min((int) LOCAL_SIZE, NUM_ACCEPTORS-acceptorStart);
if (LOCAL_ID < blockSize) { SYNC_WARPS;
int4 atoms2 = acceptorAtoms[acceptorStart+LOCAL_ID]; localData[LOCAL_ID].f1 = make_real3(0);
posBuffer[3*LOCAL_ID] = (atoms2.x > -1 ? posq[atoms2.x] : make_real4(0)); localData[LOCAL_ID].f2 = make_real3(0);
posBuffer[3*LOCAL_ID+1] = (atoms2.y > -1 ? posq[atoms2.y] : make_real4(0)); localData[LOCAL_ID].f3 = make_real3(0);
posBuffer[3*LOCAL_ID+2] = (atoms2.z > -1 ? posq[atoms2.z] : make_real4(0)); int blockSize = min(32, NUM_ACCEPTORS-acceptorStart);
} int4 atoms2 = (indexInWarp < blockSize ? acceptorAtoms[acceptorStart+indexInWarp] : make_int4(-1));
SYNC_THREADS; if (indexInWarp < blockSize) {
localData[LOCAL_ID].pos1 = (atoms2.x > -1 ? trimTo3(posq[atoms2.x]) : make_real3(0));
localData[LOCAL_ID].pos2 = (atoms2.y > -1 ? trimTo3(posq[atoms2.y]) : make_real3(0));
localData[LOCAL_ID].pos3 = (atoms2.z > -1 ? trimTo3(posq[atoms2.z]) : make_real3(0));
}
SYNC_WARPS;
if (donorIndex < NUM_DONORS) { if (donorIndex < NUM_DONORS) {
for (int index = 0; index < blockSize; index++) { int index = indexInWarp;
for (int j = 0; j < 32; j++) {
int acceptorIndex = acceptorStart+index; int acceptorIndex = acceptorStart+index;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
if (acceptorIndex == exclusionIndices.x || acceptorIndex == exclusionIndices.y || acceptorIndex == exclusionIndices.z || acceptorIndex == exclusionIndices.w) if (acceptorIndex < NUM_ACCEPTORS && acceptorIndex != exclusionIndices.x && acceptorIndex != exclusionIndices.y && acceptorIndex != exclusionIndices.z && acceptorIndex != exclusionIndices.w) {
continue; #else
if (acceptorIndex < NUM_ACCEPTORS) {
#endif #endif
// Compute the interaction between a donor and an acceptor. // Compute the interaction between a donor and an acceptor.
real4 a1 = posBuffer[3*index]; real3 a1 = localData[tbx+index].pos1;
real4 a2 = posBuffer[3*index+1]; real3 a2 = localData[tbx+index].pos2;
real4 a3 = posBuffer[3*index+2]; real3 a3 = localData[tbx+index].pos3;
real4 deltaD1A1 = delta(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ); real4 deltaD1A1 = delta(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (deltaD1A1.w < CUTOFF_SQUARED) { if (deltaD1A1.w < CUTOFF_SQUARED) {
#endif #endif
COMPUTE_DONOR_FORCE COMPUTE_FORCE
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif #endif
} }
index = (index+1)%32;
} }
} }
// Write results // Write results
if (donorIndex < NUM_DONORS) { if (donorIndex < NUM_DONORS) {
if (atoms.x > -1) { applyForce(atoms.x, f1, force);
ATOMIC_ADD(&force[atoms.x], (mm_ulong) realToFixedPoint(f1.x)); applyForce(atoms.y, f2, force);
ATOMIC_ADD(&force[atoms.x+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f1.y)); applyForce(atoms.z, f3, force);
ATOMIC_ADD(&force[atoms.x+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f1.z));
MEM_FENCE;
}
if (atoms.y > -1) {
ATOMIC_ADD(&force[atoms.y], (mm_ulong) realToFixedPoint(f2.x));
ATOMIC_ADD(&force[atoms.y+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f2.y));
ATOMIC_ADD(&force[atoms.y+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f2.z));
MEM_FENCE;
}
if (atoms.z > -1) {
ATOMIC_ADD(&force[atoms.z], (mm_ulong) realToFixedPoint(f3.x));
ATOMIC_ADD(&force[atoms.z+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f3.y));
ATOMIC_ADD(&force[atoms.z+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f3.z));
MEM_FENCE;
}
} }
SYNC_WARPS;
applyForce(atoms2.x, localData[LOCAL_ID].f1, force);
applyForce(atoms2.y, localData[LOCAL_ID].f2, force);
applyForce(atoms2.z, localData[LOCAL_ID].f3, force);
} }
energyBuffer[GLOBAL_ID] += energy; energyBuffer[GLOBAL_ID] += energy;
} }
/**
 * Compute forces on acceptors.
 *
 * Each pass of the outer loop assigns at most one acceptor to this thread
 * (index acceptorStart+GLOBAL_ID). That acceptor is paired with every donor
 * that is not excluded (USE_EXCLUSIONS) and, when USE_CUTOFF is defined, lies
 * within the cutoff. The forces on the acceptor's atoms are then added to the
 * force buffer as fixed-point values, laid out as three consecutive arrays of
 * PADDED_NUM_ATOMS entries (x, y, z).
 *
 * Donor positions are staged through posBuffer in blocks of LOCAL_SIZE, three
 * consecutive entries per donor.
 */
KERNEL void computeAcceptorForces(
        GLOBAL mm_ulong* RESTRICT force,
        GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT exclusions,
        GLOBAL const int4* RESTRICT donorAtoms, GLOBAL const int4* RESTRICT acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        PARAMETER_ARGUMENTS) {
    LOCAL real4 posBuffer[3*THREAD_BLOCK_SIZE];
    for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += GLOBAL_SIZE) {
        // The accumulators belong to the single acceptor handled in this pass,
        // so they are reset here. Zeroing them only once before the loop would
        // add the force of an earlier acceptor to every later one processed by
        // the same thread.
        real3 f1 = make_real3(0);
        real3 f2 = make_real3(0);
        real3 f3 = make_real3(0);

        // Load information about the acceptor this thread will compute forces on.
        int acceptorIndex = acceptorStart+GLOBAL_ID;
        int4 atoms, exclusionIndices;
        real4 a1, a2, a3;
        if (acceptorIndex < NUM_ACCEPTORS) {
            atoms = acceptorAtoms[acceptorIndex];
            // An atom index of -1 marks an unused slot; its position is zero.
            a1 = (atoms.x > -1 ? posq[atoms.x] : make_real4(0));
            a2 = (atoms.y > -1 ? posq[atoms.y] : make_real4(0));
            a3 = (atoms.z > -1 ? posq[atoms.z] : make_real4(0));
#ifdef USE_EXCLUSIONS
            exclusionIndices = exclusions[acceptorIndex];
#endif
        }
        else
            atoms = make_int4(-1, -1, -1, -1);
        for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += LOCAL_SIZE) {
            // Load the next block of donors into local memory.
            // Threads without a valid acceptor still take part in the load and
            // in both SYNC_THREADS calls.
            SYNC_THREADS;
            int blockSize = min((int) LOCAL_SIZE, NUM_DONORS-donorStart);
            if (LOCAL_ID < blockSize) {
                int4 atoms2 = donorAtoms[donorStart+LOCAL_ID];
                posBuffer[3*LOCAL_ID] = (atoms2.x > -1 ? posq[atoms2.x] : make_real4(0));
                posBuffer[3*LOCAL_ID+1] = (atoms2.y > -1 ? posq[atoms2.y] : make_real4(0));
                posBuffer[3*LOCAL_ID+2] = (atoms2.z > -1 ? posq[atoms2.z] : make_real4(0));
            }
            SYNC_THREADS;
            if (acceptorIndex < NUM_ACCEPTORS) {
                for (int index = 0; index < blockSize; index++) {
                    int donorIndex = donorStart+index;
#ifdef USE_EXCLUSIONS
                    if (donorIndex == exclusionIndices.x || donorIndex == exclusionIndices.y || donorIndex == exclusionIndices.z || donorIndex == exclusionIndices.w)
                        continue;
#endif
                    // Compute the interaction between a donor and an acceptor.
                    // The names d1..d3, a1..a3, deltaD1A1 and f1..f3 are kept as
                    // they are because COMPUTE_ACCEPTOR_FORCE is expanded in this
                    // scope and presumably refers to them.
                    real4 d1 = posBuffer[3*index];
                    real4 d2 = posBuffer[3*index+1];
                    real4 d3 = posBuffer[3*index+2];
                    real4 deltaD1A1 = delta(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
#ifdef USE_CUTOFF
                    if (deltaD1A1.w < CUTOFF_SQUARED) {
#endif
                        COMPUTE_ACCEPTOR_FORCE
#ifdef USE_CUTOFF
                    }
#endif
                }
            }
        }

        // Write results
        if (acceptorIndex < NUM_ACCEPTORS) {
            if (atoms.x > -1) {
                ATOMIC_ADD(&force[atoms.x], (mm_ulong) realToFixedPoint(f1.x));
                ATOMIC_ADD(&force[atoms.x+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f1.y));
                ATOMIC_ADD(&force[atoms.x+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f1.z));
                MEM_FENCE;
            }
            if (atoms.y > -1) {
                ATOMIC_ADD(&force[atoms.y], (mm_ulong) realToFixedPoint(f2.x));
                ATOMIC_ADD(&force[atoms.y+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f2.y));
                ATOMIC_ADD(&force[atoms.y+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f2.z));
                MEM_FENCE;
            }
            if (atoms.z > -1) {
                ATOMIC_ADD(&force[atoms.z], (mm_ulong) realToFixedPoint(f3.x));
                ATOMIC_ADD(&force[atoms.z+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f3.y));
                ATOMIC_ADD(&force[atoms.z+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f3.z));
                MEM_FENCE;
            }
        }
    }
}
...@@ -309,6 +309,43 @@ void testParameters() { ...@@ -309,6 +309,43 @@ void testParameters() {
ASSERT_EQUAL_TOL(2*(2*1.8+2.1)+2*(2*1.5+2.1), state.getPotentialEnergy(), TOL); ASSERT_EQUAL_TOL(2*(2*1.8+2.1)+2*(2*1.5+2.1), state.getPotentialEnergy(), TOL);
} }
void testLargeSystem() {
int numParticles = 5000;
System system;
CustomHbondForce* custom = new CustomHbondForce("distance(d1,a1)^2");
vector<Vec3> positions(numParticles);
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
for (int i = 0; i < numParticles; i++) {
system.addParticle(1.0);
if (i%2 == 0)
custom->addDonor(i, -1, -1);
else
custom->addAcceptor(i, -1, -1);
positions[i] = Vec3(3.0*genrand_real2(sfmt), 3.0*genrand_real2(sfmt), 3.0*genrand_real2(sfmt));
}
system.addForce(custom);
VerletIntegrator integrator(0.01);
Context context(system, integrator, platform);
context.setPositions(positions);
State state = context.getState(State::Energy | State::Forces);
double expectedEnergy = 0;
for (int i = 0; i < numParticles; i += 2) {
for (int j = 1; j < numParticles; j += 2) {
Vec3 d = positions[i]-positions[j];
double r = sqrt(d.dot(d));
expectedEnergy += r*r;
}
}
ASSERT_EQUAL_TOL(expectedEnergy, state.getPotentialEnergy(), 1e-5);
for (int i = 0; i < numParticles; i += 2) {
Vec3 expectedForce;
for (int j = 1; j < numParticles; j += 2)
expectedForce += 2*(positions[j]-positions[i]);
ASSERT_EQUAL_VEC(expectedForce, state.getForces()[i], 1e-5);
}
}
void runPlatformTests(); void runPlatformTests();
int main(int argc, char* argv[]) { int main(int argc, char* argv[]) {
...@@ -321,6 +358,7 @@ int main(int argc, char* argv[]) { ...@@ -321,6 +358,7 @@ int main(int argc, char* argv[]) {
test2DFunction(); test2DFunction();
testIllegalVariable(); testIllegalVariable();
testParameters(); testParameters();
testLargeSystem();
runPlatformTests(); runPlatformTests();
} }
catch(const exception& e) { catch(const exception& e) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment