Commit 86573494 authored by Peter Eastman
Browse files

Optimizations to PME electrostatics kernel

parent 747dd2bc
...@@ -1051,6 +1051,8 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -1051,6 +1051,8 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
electrostaticsSource << CudaKernelSources::vectorOps; electrostaticsSource << CudaKernelSources::vectorOps;
electrostaticsSource << CudaAmoebaKernelSources::pmeMultipoleElectrostatics; electrostaticsSource << CudaAmoebaKernelSources::pmeMultipoleElectrostatics;
electrostaticsSource << CudaAmoebaKernelSources::pmeElectrostaticPairForce; electrostaticsSource << CudaAmoebaKernelSources::pmeElectrostaticPairForce;
electrostaticsSource << "#define APPLY_SCALE\n";
electrostaticsSource << CudaAmoebaKernelSources::pmeElectrostaticPairForce;
} }
else { else {
electrostaticsSource << CudaKernelSources::vectorOps; electrostaticsSource << CudaKernelSources::vectorOps;
......
#define APPLY_SCALE __device__ void
#ifdef APPLY_SCALE
__device__ void computeOneInteractionF1(AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, real bn5, float forceFactor, computeOneInteractionF1(
#else
computeOneInteractionF1NoScale(
#endif
AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, real bn5, float forceFactor,
#ifdef APPLY_SCALE #ifdef APPLY_SCALE
float dScale, float pScale, float mScale, float dScale, float pScale, float mScale,
#endif #endif
...@@ -165,7 +169,13 @@ __device__ void computeOneInteractionF1(AtomData& atom1, volatile AtomData& atom ...@@ -165,7 +169,13 @@ __device__ void computeOneInteractionF1(AtomData& atom1, volatile AtomData& atom
} }
__device__ void computeOneInteractionF2(AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, float forceFactor, __device__ void
#ifdef APPLY_SCALE
computeOneInteractionF2(
#else
computeOneInteractionF2NoScale(
#endif
AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, float forceFactor,
#ifdef APPLY_SCALE #ifdef APPLY_SCALE
float dScale, float pScale, float mScale, float dScale, float pScale, float mScale,
#endif #endif
...@@ -601,7 +611,13 @@ __device__ void computeOneInteractionF2(AtomData& atom1, volatile AtomData& atom ...@@ -601,7 +611,13 @@ __device__ void computeOneInteractionF2(AtomData& atom1, volatile AtomData& atom
} }
__device__ void computeOneInteractionT1(AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn __device__ void
#ifdef APPLY_SCALE
computeOneInteractionT1(
#else
computeOneInteractionT1NoScale(
#endif
AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn
#ifdef APPLY_SCALE #ifdef APPLY_SCALE
, float dScale, float pScale, float mScale , float dScale, float pScale, float mScale
#endif #endif
...@@ -761,7 +777,13 @@ __device__ void computeOneInteractionT1(AtomData& atom1, volatile AtomData& atom ...@@ -761,7 +777,13 @@ __device__ void computeOneInteractionT1(AtomData& atom1, volatile AtomData& atom
} }
__device__ void computeOneInteractionT2(AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn __device__ void
#ifdef APPLY_SCALE
computeOneInteractionT2(
#else
computeOneInteractionT2NoScale(
#endif
AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn
#ifdef APPLY_SCALE #ifdef APPLY_SCALE
, float dScale, float pScale, float mScale , float dScale, float pScale, float mScale
#endif #endif
...@@ -800,13 +822,11 @@ __device__ void computeOneInteractionT2(AtomData& atom1, volatile AtomData& atom ...@@ -800,13 +822,11 @@ __device__ void computeOneInteractionT2(AtomData& atom1, volatile AtomData& atom
real pgamma = atom1.thole < atom2.thole ? atom1.thole : atom2.thole; real pgamma = atom1.thole < atom2.thole ? atom1.thole : atom2.thole;
real ratio = RECIP(rr1*damp); real ratio = RECIP(rr1*damp);
damp = -pgamma*ratio*ratio*ratio; damp = -pgamma*ratio*ratio*ratio;
if (damp > -50) {
real expdamp = EXP(damp); real expdamp = EXP(damp);
scale3 = 1 - expdamp; scale3 = 1 - expdamp;
scale5 = 1 - (1-damp)*expdamp; scale5 = 1 - (1-damp)*expdamp;
scale7 = 1 - (1-damp+0.6f*damp*damp)*expdamp; scale7 = 1 - (1-damp+0.6f*damp*damp)*expdamp;
} }
}
real rr3 = rr1*rr1*rr1; real rr3 = rr1*rr1*rr1;
#ifdef APPLY_SCALE #ifdef APPLY_SCALE
...@@ -928,107 +948,3 @@ __device__ void computeOneInteractionT2(AtomData& atom1, volatile AtomData& atom ...@@ -928,107 +948,3 @@ __device__ void computeOneInteractionT2(AtomData& atom1, volatile AtomData& atom
atom1.torque.y += ttm2i2; atom1.torque.y += ttm2i2;
atom1.torque.z += ttm2i3; atom1.torque.z += ttm2i3;
} }
/**
 * Compute the pairwise interaction between two atoms.
 *
 * Adds the charge-charge term forceFactor*q1*q2*bn0 to `energy`, then calls the
 * F1/F2 helpers (which receive `force` and `energy` by reference) and the T1/T2
 * helpers (which receive the atoms by reference).  The resulting force is added
 * to atom1.force and, only when forceFactor == 1, subtracted from atom2.force.
 * Likewise the torque helpers are run a second time with the atoms swapped and
 * the displacement negated only when forceFactor == 1.
 *
 * Returns without doing anything if the squared distance exceeds CUTOFF_SQUARED.
 *
 * NOTE(review): hasExclusions is accepted but not consulted in this version; the
 * scaled helper variants are always used.
 */
__device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool hasExclusions, float dScale, float pScale, float mScale, float forceFactor,
        real& energy, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
    // Displacement from atom1 to atom2.  Declared as real4 (not float4) so it matches
    // the real4 parameters of the F1/F2/T1/T2 helpers and the real4 box vectors.
    real4 delta;
    delta.x = atom2.pos.x - atom1.pos.x;
    delta.y = atom2.pos.y - atom1.pos.y;
    delta.z = atom2.pos.z - atom1.pos.z;

    // periodic box: shift each component by a whole number of box lengths
    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;

    // delta.w temporarily holds the squared distance for the cutoff test.
    delta.w = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
    if (delta.w > CUTOFF_SQUARED)
        return;

    real r = SQRT(delta.w);
    real ralpha = EWALD_ALPHA*r;

    real alsq2 = 2*EWALD_ALPHA*EWALD_ALPHA;
    real alsq2n = 0;
    if (EWALD_ALPHA > 0)
        alsq2n = RECIP(SQRT_PI*EWALD_ALPHA);
    real exp2a = EXP(-(ralpha*ralpha));

    real rr1 = RECIP(r);
    // From here on delta.w carries 1/r, which is what the helpers receive.
    delta.w = rr1;
    real bn0 = erfc(ralpha)*rr1;
    energy += forceFactor*atom1.q*atom2.q*bn0;
    real rr2 = rr1*rr1;

    // Recurrence: each term is ((2k-1)*previous + alsq2n*exp2a)*rr2, with alsq2n
    // multiplied by alsq2 before every step.
    alsq2n *= alsq2;
    real4 bn;
    bn.x = (bn0+alsq2n*exp2a)*rr2;

    alsq2n *= alsq2;
    bn.y = (3*bn.x+alsq2n*exp2a)*rr2;

    alsq2n *= alsq2;
    bn.z = (5*bn.y+alsq2n*exp2a)*rr2;

    alsq2n *= alsq2;
    bn.w = (7*bn.z+alsq2n*exp2a)*rr2;

    alsq2n *= alsq2;
    real bn5 = (9*bn.w+alsq2n*exp2a)*rr2;

    // NOTE(review): force is passed uninitialized; presumably F1 assigns it before
    // F2 accumulates into it -- confirm against the helper bodies.
    real3 force;

    computeOneInteractionF1(atom1, atom2, delta, bn, bn5, forceFactor, dScale, pScale, mScale, force, energy);
    computeOneInteractionF2(atom1, atom2, delta, bn, forceFactor, dScale, pScale, mScale, force, energy);

    atom1.force += force;
    if (forceFactor == 1)
        atom2.force -= force;

    computeOneInteractionT1(atom1, atom2, delta, bn, dScale, pScale, mScale);
    computeOneInteractionT2(atom1, atom2, delta, bn, dScale, pScale, mScale);

    if (forceFactor == 1) {
        // T3 == T1 w/ particles I and J reversed
        // T4 == T2 w/ particles I and J reversed

        delta.x = -delta.x;
        delta.y = -delta.y;
        delta.z = -delta.z;
        computeOneInteractionT1(atom2, atom1, delta, bn, dScale, pScale, mScale);
        computeOneInteractionT2(atom2, atom1, delta, bn, dScale, pScale, mScale);
    }
}
/**
 * Add one atom's self energy to the running energy total, and add the
 * corresponding self torque to that atom's torque accumulator.
 */
__device__ void computeSelfEnergyAndTorque(AtomData& atom1, real& energy) {
    // Squared magnitudes of the atom's charge, dipole and quadrupole components.
    real chargeSq = atom1.q*atom1.q;
    real dipoleSq = dot(atom1.dipole, atom1.dipole);
    real quadSq = 2*(atom1.quadrupoleXX*atom1.quadrupoleXX +
                     atom1.quadrupoleYY*atom1.quadrupoleYY +
                     atom1.quadrupoleXX*atom1.quadrupoleYY +
                     atom1.quadrupoleXY*atom1.quadrupoleXY +
                     atom1.quadrupoleXZ*atom1.quadrupoleXZ +
                     atom1.quadrupoleYZ*atom1.quadrupoleYZ);
    // Permanent dipole dotted with the induced dipole.
    real dipoleDotInduced = dot(atom1.dipole, atom1.inducedDipole);

    // Factors that depend only on the Ewald parameter.
    real twoAlphaSq = 2*EWALD_ALPHA*EWALD_ALPHA;
    real prefactor = -EWALD_ALPHA/SQRT_PI;

    real selfEnergy = (chargeSq + twoAlphaSq*(dipoleSq/3 + 2*twoAlphaSq*quadSq/5));
    selfEnergy += twoAlphaSq*dipoleDotInduced/3;
    selfEnergy *= prefactor;
    energy += selfEnergy;

    // self-torque for PME: uses the sum of both induced dipole sets
    real3 inducedSum = atom1.inducedDipole+atom1.inducedDipolePolar;
    atom1.torque += ((2/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT_PI)*cross(atom1.dipole, inducedSum);
}
\ No newline at end of file
...@@ -8,8 +8,14 @@ typedef struct { ...@@ -8,8 +8,14 @@ typedef struct {
float thole, damp, padding; float thole, damp, padding;
} AtomData; } AtomData;
__device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool hasExclusions, float dScale, float pScale, float mScale, float forceFactor, real& energy, real4 periodicBoxSize, real4 invPeriodicBoxSize); __device__ void computeOneInteractionF1(AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, real bn5, float forceFactor, float dScale, float pScale, float mScale, real3& force, real& energy);
__device__ void computeSelfEnergyAndTorque(AtomData& atom1, real& energy); __device__ void computeOneInteractionF2(AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, float forceFactor, float dScale, float pScale, float mScale, real3& force, real& energy);
__device__ void computeOneInteractionT1(AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn, float dScale, float pScale, float mScale);
__device__ void computeOneInteractionT2(AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn, float dScale, float pScale, float mScale);
__device__ void computeOneInteractionF1NoScale(AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, real bn5, float forceFactor, real3& force, real& energy);
__device__ void computeOneInteractionF2NoScale(AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, float forceFactor, real3& force, real& energy);
__device__ void computeOneInteractionT1NoScale(AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn);
__device__ void computeOneInteractionT2NoScale(AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn);
inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __restrict__ posq, const real* __restrict__ labFrameDipole, inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __restrict__ posq, const real* __restrict__ labFrameDipole,
const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) { const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
...@@ -52,6 +58,123 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr ...@@ -52,6 +58,123 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
return (x && y ? 0.0f : (x && p ? 0.5f : 1.0f)); return (x && y ? 0.0f : (x && p ? 0.5f : 1.0f));
} }
/**
 * Compute the pairwise interaction between two atoms.
 *
 * Adds the charge-charge term forceFactor*q1*q2*bn0 to `energy`, then calls the
 * F1/F2 helpers (which receive `force` and `energy` by reference) and the T1/T2
 * helpers (which receive the atoms by reference).  When hasExclusions is true the
 * variants taking dScale/pScale/mScale are called; otherwise the "NoScale"
 * variants are called and the three scale factors are not used.
 *
 * The resulting force is added to atom1.force and, only when forceFactor == 1,
 * subtracted from atom2.force.  Likewise the torque helpers are run a second time
 * with the atoms swapped and the displacement negated only when forceFactor == 1.
 *
 * Returns without doing anything if the squared distance exceeds CUTOFF_SQUARED.
 */
__device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool hasExclusions, float dScale, float pScale, float mScale, float forceFactor,
        real& energy, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
    // Displacement from atom1 to atom2.  Declared as real4 (not float4) so it matches
    // the real4 parameters of the F1/F2/T1/T2 helpers and the real4 box vectors.
    real4 delta;
    delta.x = atom2.pos.x - atom1.pos.x;
    delta.y = atom2.pos.y - atom1.pos.y;
    delta.z = atom2.pos.z - atom1.pos.z;

    // periodic box: shift each component by a whole number of box lengths
    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;

    // delta.w temporarily holds the squared distance for the cutoff test.
    delta.w = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
    if (delta.w > CUTOFF_SQUARED)
        return;

    real r = SQRT(delta.w);
    real ralpha = EWALD_ALPHA*r;

    real alsq2 = 2*EWALD_ALPHA*EWALD_ALPHA;
    real alsq2n = 0;
    if (EWALD_ALPHA > 0)
        alsq2n = RECIP(SQRT_PI*EWALD_ALPHA);
    real exp2a = EXP(-(ralpha*ralpha));

    real rr1 = RECIP(r);
    // From here on delta.w carries 1/r, which is what the helpers receive.
    delta.w = rr1;
    real bn0 = erfc(ralpha)*rr1;
    energy += forceFactor*atom1.q*atom2.q*bn0;
    real rr2 = rr1*rr1;

    // Recurrence: each term is ((2k-1)*previous + alsq2n*exp2a)*rr2, with alsq2n
    // multiplied by alsq2 before every step.
    alsq2n *= alsq2;
    real4 bn;
    bn.x = (bn0+alsq2n*exp2a)*rr2;

    alsq2n *= alsq2;
    bn.y = (3*bn.x+alsq2n*exp2a)*rr2;

    alsq2n *= alsq2;
    bn.z = (5*bn.y+alsq2n*exp2a)*rr2;

    alsq2n *= alsq2;
    bn.w = (7*bn.z+alsq2n*exp2a)*rr2;

    alsq2n *= alsq2;
    real bn5 = (9*bn.w+alsq2n*exp2a)*rr2;

    // NOTE(review): force is passed uninitialized; presumably F1 assigns it before
    // F2 accumulates into it -- confirm against the helper bodies.
    real3 force;

    if (hasExclusions) {
        computeOneInteractionF1(atom1, atom2, delta, bn, bn5, forceFactor, dScale, pScale, mScale, force, energy);
        computeOneInteractionF2(atom1, atom2, delta, bn, forceFactor, dScale, pScale, mScale, force, energy);
    }
    else {
        computeOneInteractionF1NoScale(atom1, atom2, delta, bn, bn5, forceFactor, force, energy);
        computeOneInteractionF2NoScale(atom1, atom2, delta, bn, forceFactor, force, energy);
    }

    atom1.force += force;
    if (forceFactor == 1)
        atom2.force -= force;

    if (hasExclusions) {
        computeOneInteractionT1(atom1, atom2, delta, bn, dScale, pScale, mScale);
        computeOneInteractionT2(atom1, atom2, delta, bn, dScale, pScale, mScale);
    }
    else {
        computeOneInteractionT1NoScale(atom1, atom2, delta, bn);
        computeOneInteractionT2NoScale(atom1, atom2, delta, bn);
    }

    if (forceFactor == 1) {
        // T3 == T1 w/ particles I and J reversed
        // T4 == T2 w/ particles I and J reversed

        delta.x = -delta.x;
        delta.y = -delta.y;
        delta.z = -delta.z;
        if (hasExclusions) {
            computeOneInteractionT1(atom2, atom1, delta, bn, dScale, pScale, mScale);
            computeOneInteractionT2(atom2, atom1, delta, bn, dScale, pScale, mScale);
        }
        else {
            computeOneInteractionT1NoScale(atom2, atom1, delta, bn);
            computeOneInteractionT2NoScale(atom2, atom1, delta, bn);
        }
    }
}
/**
 * Add one atom's self energy to the running energy total, and add the
 * corresponding self torque to that atom's torque accumulator.
 */
__device__ void computeSelfEnergyAndTorque(AtomData& atom1, real& energy) {
    // Squared magnitudes of the atom's charge, dipole and quadrupole components.
    real chargeSq = atom1.q*atom1.q;
    real dipoleSq = dot(atom1.dipole, atom1.dipole);
    real quadSq = 2*(atom1.quadrupoleXX*atom1.quadrupoleXX +
                     atom1.quadrupoleYY*atom1.quadrupoleYY +
                     atom1.quadrupoleXX*atom1.quadrupoleYY +
                     atom1.quadrupoleXY*atom1.quadrupoleXY +
                     atom1.quadrupoleXZ*atom1.quadrupoleXZ +
                     atom1.quadrupoleYZ*atom1.quadrupoleYZ);
    // Permanent dipole dotted with the induced dipole.
    real dipoleDotInduced = dot(atom1.dipole, atom1.inducedDipole);

    // Factors that depend only on the Ewald parameter.
    real twoAlphaSq = 2*EWALD_ALPHA*EWALD_ALPHA;
    real prefactor = -EWALD_ALPHA/SQRT_PI;

    real selfEnergy = (chargeSq + twoAlphaSq*(dipoleSq/3 + 2*twoAlphaSq*quadSq/5));
    selfEnergy += twoAlphaSq*dipoleDotInduced/3;
    selfEnergy *= prefactor;
    energy += selfEnergy;

    // self-torque for PME: uses the sum of both induced dipole sets
    real3 inducedSum = atom1.inducedDipole+atom1.inducedDipolePolar;
    atom1.torque += ((2/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT_PI)*cross(atom1.dipole, inducedSum);
}
/** /**
* Compute electrostatic interactions. * Compute electrostatic interactions.
*/ */
...@@ -175,120 +298,93 @@ extern "C" __global__ void computeElectrostatics( ...@@ -175,120 +298,93 @@ extern "C" __global__ void computeElectrostatics(
localData[threadIdx.x].force = make_real3(0); localData[threadIdx.x].force = make_real3(0);
localData[threadIdx.x].torque = make_real3(0); localData[threadIdx.x].torque = make_real3(0);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
// unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF); unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
// if (!hasExclusions && flags != 0xFFFFFFFF) { if (!hasExclusions && flags != 0xFFFFFFFF) {
// if (flags == 0) { if (flags == 0) {
// // No interactions in this tile. // No interactions in this tile.
// } }
// else { else {
// // Compute only a subset of the interactions in this tile. // Compute only a subset of the interactions in this tile.
//
// for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
// if ((flags&(1<<j)) != 0) { if ((flags&(1<<j)) != 0) {
// int atom2 = tbx+j; int atom2 = tbx+j;
// computeOneInteraction(data, localData[tbx+j], false, 1, 1, 1, 1, energy, periodicBoxSize, invPeriodicBoxSize)); real3 oldForce = localData[atom2].force;
// data.force += tempForce; real3 oldTorque = localData[atom2].torque;
// localData[atom2].force -= tempForce; localData[atom2].force = make_real3(0);
// energy += tempEnergy; localData[atom2].torque = make_real3(0);
// if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { computeOneInteraction(data, localData[tbx+j], false, 1, 1, 1, 1, energy, periodicBoxSize, invPeriodicBoxSize);
//#ifdef ENABLE_SHUFFLE real3 newForce = localData[atom2].force;
// for (int i = 16; i >= 1; i /= 2) { real3 newTorque = localData[atom2].torque;
// tempForce.x += __shfl_xor(tempForce.x, i, 32); if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
// tempForce.y += __shfl_xor(tempForce.y, i, 32); #ifdef ENABLE_SHUFFLE
// tempForce.z += __shfl_xor(tempForce.z, i, 32); for (int i = 16; i >= 1; i /= 2) {
// } newForce.x += __shfl_xor(newForce.x, i, 32);
// if (tgx == 0) newForce.y += __shfl_xor(newForce.y, i, 32);
// localData[atom2].force -= tempForce; newForce.z += __shfl_xor(newForce.z, i, 32);
//#else newTorque.x += __shfl_xor(newTorque.x, i, 32);
// int bufferIndex = 3*threadIdx.x; newTorque.y += __shfl_xor(newTorque.y, i, 32);
// tempBuffer[bufferIndex] = tempForce.x; newTorque.z += __shfl_xor(newTorque.z, i, 32);
// tempBuffer[bufferIndex+1] = tempForce.y; }
// tempBuffer[bufferIndex+2] = tempForce.z; if (tgx == 0) {
// if (tgx % 4 == 0) { localData[atom2].force -= newForce;
// tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9]; localData[atom2].torque -= newTorque;
// tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10]; }
// tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11]; #else
// } int bufferIndex = 3*threadIdx.x;
// if (tgx == 0) { tempBuffer[bufferIndex] = newForce.x;
// localData[atom2].force.x -= tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84]; tempBuffer[bufferIndex+1] = newForce.y;
// localData[atom2].force.y -= tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85]; tempBuffer[bufferIndex+2] = newForce.z;
// localData[atom2].force.z -= tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86]; if (tgx % 4 == 0) {
// } tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
//#endif tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
// } tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
// } }
// } if (tgx == 0) {
// data.force *= -ENERGY_SCALE_FACTOR; localData[atom2].force.x -= tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
// localData[threadIdx.x].force *= -ENERGY_SCALE_FACTOR; localData[atom2].force.y -= tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
// if (pos < end) { localData[atom2].force.z -= tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
// unsigned int offset = x*TILE_SIZE + tgx; }
// atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0xFFFFFFFF))); tempBuffer[bufferIndex] = newTorque.x;
// atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0xFFFFFFFF))); tempBuffer[bufferIndex+1] = newTorque.y;
// atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0xFFFFFFFF))); tempBuffer[bufferIndex+2] = newTorque.z;
// offset = y*TILE_SIZE + tgx; if (tgx % 4 == 0) {
// atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0xFFFFFFFF))); tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
// atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0xFFFFFFFF))); tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
// atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0xFFFFFFFF))); tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
// } }
// if (tgx == 0) {
// // Compute torques. localData[atom2].torque.x -= tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
// localData[atom2].torque.y -= tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
// for (j = 0; j < TILE_SIZE; j++) { localData[atom2].torque.z -= tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
// if ((flags&(1<<j)) != 0) { }
// int atom2 = tbx+j; #endif
// real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z); }
//#ifdef USE_PERIODIC }
// delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; }
// delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; data.force *= -ENERGY_SCALE_FACTOR;
// delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; data.torque *= -ENERGY_SCALE_FACTOR;
//#endif localData[threadIdx.x].force *= -ENERGY_SCALE_FACTOR;
// real3 tempForce; localData[threadIdx.x].torque *= -ENERGY_SCALE_FACTOR;
// computeOneInteractionT1(data, localData[atom2], 1, 1, 1, tempForce); if (pos < end) {
// data.force += tempForce; unsigned int offset = x*TILE_SIZE + tgx;
// computeOneInteractionT3(data, localData[atom2], 1, 1, 1, tempForce); atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0xFFFFFFFF)));
// if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0xFFFFFFFF)));
//#ifdef ENABLE_SHUFFLE atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0xFFFFFFFF)));
// for (int i = 16; i >= 1; i /= 2) { atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.torque.x*0xFFFFFFFF)));
// tempForce.x += __shfl_xor(tempForce.x, i, 32); atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0xFFFFFFFF)));
// tempForce.y += __shfl_xor(tempForce.y, i, 32); atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0xFFFFFFFF)));
// tempForce.z += __shfl_xor(tempForce.z, i, 32); offset = y*TILE_SIZE + tgx;
// } atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0xFFFFFFFF)));
// if (tgx == 0) atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0xFFFFFFFF)));
// localData[atom2].force -= tempForce; atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0xFFFFFFFF)));
//#else atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.x*0xFFFFFFFF)));
// int bufferIndex = 3*threadIdx.x; atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.y*0xFFFFFFFF)));
// tempBuffer[bufferIndex] = tempForce.x; atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.z*0xFFFFFFFF)));
// tempBuffer[bufferIndex+1] = tempForce.y; }
// tempBuffer[bufferIndex+2] = tempForce.z; }
// if (tgx % 4 == 0) { }
// tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9]; else
// tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
// tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
// }
// if (tgx == 0) {
// localData[atom2].force.x += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
// localData[atom2].force.y += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
// localData[atom2].force.z += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
// }
//#endif
// }
// }
// }
// data.force *= ENERGY_SCALE_FACTOR;
// localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
// if (pos < end) {
// unsigned int offset = x*TILE_SIZE + tgx;
// atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0xFFFFFFFF)));
// atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0xFFFFFFFF)));
// atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0xFFFFFFFF)));
// offset = y*TILE_SIZE + tgx;
// atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0xFFFFFFFF)));
// atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0xFFFFFFFF)));
// atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0xFFFFFFFF)));
// }
// }
// }
// else
#endif #endif
{ {
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment