Commit 9eb02774 authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimizations to Ewald summation

parent cfafe7f2
...@@ -337,7 +337,6 @@ struct cudaGmxSimulation { ...@@ -337,7 +337,6 @@ struct cudaGmxSimulation {
float collisionProbability; // Collision probability for Andersen thermostat float collisionProbability; // Collision probability for Andersen thermostat
float2* pObcData; // Pointer to fixed Born data float2* pObcData; // Pointer to fixed Born data
float2* pAttr; // Pointer to additional atom attributes (sig, eps) float2* pAttr; // Pointer to additional atom attributes (sig, eps)
float2* pEwaldEikr; // Pointer to exponents of reciprocal vectors and atom coordinates (ewald)
float2* pEwaldCosSinSum; // Pointer to the cos/sin sums (ewald) float2* pEwaldCosSinSum; // Pointer to the cos/sin sums (ewald)
unsigned int bonds; // Number of bonds unsigned int bonds; // Number of bonds
int4* pBondID; // Bond atom and output buffer IDs int4* pBondID; // Bond atom and output buffer IDs
......
...@@ -432,8 +432,6 @@ void gpuSetEwaldParameters(gpuContext gpu)//, float alphaEwald, int kmax ) ...@@ -432,8 +432,6 @@ void gpuSetEwaldParameters(gpuContext gpu)//, float alphaEwald, int kmax )
gpu->sim.alphaEwald = alpha; gpu->sim.alphaEwald = alpha;
gpu->sim.factorEwald = -1 / (4*alpha*alpha); gpu->sim.factorEwald = -1 / (4*alpha*alpha);
gpu->sim.kmax = 20+1; gpu->sim.kmax = 20+1;
gpu->psEwaldEikr = new CUDAStream<float2>(gpu->sim.atoms*gpu->sim.kmax*3, 1, "EwaldEikr");
gpu->sim.pEwaldEikr = gpu->psEwaldEikr->_pDevStream[0];
gpu->psEwaldCosSinSum = new CUDAStream<float2>((gpu->sim.kmax*2-1) * (gpu->sim.kmax*2-1) * (gpu->sim.kmax*2-1), 1, "EwaldCosSinSum"); gpu->psEwaldCosSinSum = new CUDAStream<float2>((gpu->sim.kmax*2-1) * (gpu->sim.kmax*2-1) * (gpu->sim.kmax*2-1), 1, "EwaldCosSinSum");
gpu->sim.pEwaldCosSinSum = gpu->psEwaldCosSinSum->_pDevStream[0]; gpu->sim.pEwaldCosSinSum = gpu->psEwaldCosSinSum->_pDevStream[0];
} }
...@@ -1274,7 +1272,6 @@ void* gpuInit(int numAtoms) ...@@ -1274,7 +1272,6 @@ void* gpuInit(int numAtoms)
gpu->psRbDihedralParameter2 = NULL; gpu->psRbDihedralParameter2 = NULL;
gpu->psLJ14ID = NULL; gpu->psLJ14ID = NULL;
gpu->psLJ14Parameter = NULL; gpu->psLJ14Parameter = NULL;
gpu->psEwaldEikr = NULL;
gpu->psEwaldCosSinSum = NULL; gpu->psEwaldCosSinSum = NULL;
gpu->psShakeID = NULL; gpu->psShakeID = NULL;
gpu->psShakeParameter = NULL; gpu->psShakeParameter = NULL;
...@@ -1404,11 +1401,8 @@ void gpuShutDown(gpuContext gpu) ...@@ -1404,11 +1401,8 @@ void gpuShutDown(gpuContext gpu)
delete gpu->psxVector4; delete gpu->psxVector4;
delete gpu->psvVector4; delete gpu->psvVector4;
delete gpu->psSigEps2; delete gpu->psSigEps2;
if (gpu->psEwaldEikr != NULL) if (gpu->psEwaldCosSinSum != NULL)
{
delete gpu->psEwaldEikr;
delete gpu->psEwaldCosSinSum; delete gpu->psEwaldCosSinSum;
}
delete gpu->psObcData; delete gpu->psObcData;
delete gpu->psObcChain; delete gpu->psObcChain;
delete gpu->psBornForce; delete gpu->psBornForce;
......
...@@ -86,7 +86,6 @@ struct _gpuContext { ...@@ -86,7 +86,6 @@ struct _gpuContext {
CUDAStream<float4>* psxVector4; CUDAStream<float4>* psxVector4;
CUDAStream<float4>* psvVector4; CUDAStream<float4>* psvVector4;
CUDAStream<float2>* psSigEps2; CUDAStream<float2>* psSigEps2;
CUDAStream<float2>* psEwaldEikr;
CUDAStream<float2>* psEwaldCosSinSum; CUDAStream<float2>* psEwaldCosSinSum;
CUDAStream<float2>* psObcData; CUDAStream<float2>* psObcData;
CUDAStream<float>* psObcChain; CUDAStream<float>* psObcChain;
......
...@@ -24,6 +24,11 @@ ...@@ -24,6 +24,11 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>. * * along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
/**
* This file contains the kernel for evaluating nonbonded forces using the
* Ewald summation method (Reciprocal space summation).
*/
/* Define multiply operations for floats */ /* Define multiply operations for floats */
__device__ float2 MultofFloat2(float2 a, float2 b) __device__ float2 MultofFloat2(float2 a, float2 b)
...@@ -42,50 +47,9 @@ __device__ float2 ConjMultofFloat2(float2 a, float2 b) ...@@ -42,50 +47,9 @@ __device__ float2 ConjMultofFloat2(float2 a, float2 b)
return c; return c;
} }
#define EIR(x, y, z) cSim.pEwaldEikr[(x)+(y)*cSim.kmax+(z)*cSim.kmax*3] /**
* Precompute the cosine and sine sums which appear in each force term.
__global__ void kCalculateEwaldFastEikr_kernel() */
{
int kmax = cSim.kmax;
float4 apos;
unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x;
while (atom < cSim.atoms)
{
apos = cSim.pPosq[atom];
//generic form of the array
// pEikr[ atomID*kmax*3 + k*3 + m]
// k = 0, explicitly
for(unsigned int m = 0; (m < 3); m++) {
EIR(atom, 0, m).x = 1;
EIR(atom, 0, m).y = 0;
}
// k = 1, explicitly
EIR(atom, 1, 0).x = cos(apos.x*cSim.recipBoxSizeX);
EIR(atom, 1, 0).y = sin(apos.x*cSim.recipBoxSizeX);
EIR(atom, 1, 1).x = cos(apos.y*cSim.recipBoxSizeY);
EIR(atom, 1, 1).y = sin(apos.y*cSim.recipBoxSizeY);
EIR(atom, 1, 2).x = cos(apos.z*cSim.recipBoxSizeZ);
EIR(atom, 1, 2).y = sin(apos.z*cSim.recipBoxSizeZ);
// k > 1, by recursion
for(unsigned int k=2; (k<kmax); k++) {
for(unsigned int m=0; (m<3); m++) {
EIR(atom, k, m) = MultofFloat2(EIR(atom, k-1, m), EIR(atom, 1, m));
}
}
atom += blockDim.x * gridDim.x;
}
}
__global__ void kCalculateEwaldFastCosSinSums_kernel() __global__ void kCalculateEwaldFastCosSinSums_kernel()
{ {
...@@ -94,27 +58,42 @@ __global__ void kCalculateEwaldFastCosSinSums_kernel() ...@@ -94,27 +58,42 @@ __global__ void kCalculateEwaldFastCosSinSums_kernel()
unsigned int index = threadIdx.x + blockIdx.x * blockDim.x; unsigned int index = threadIdx.x + blockIdx.x * blockDim.x;
while (index < totalK) while (index < totalK)
{ {
// Find the wave vector (kx, ky, kz) this index corresponds to.
int rx = index/(ksize*ksize); int rx = index/(ksize*ksize);
int remainder = index - rx*ksize*ksize; int remainder = index - rx*ksize*ksize;
int ry = remainder/ksize; int ry = remainder/ksize;
int rz = remainder - ry*ksize - cSim.kmax + 1; int rz = remainder - ry*ksize - cSim.kmax + 1;
ry += -cSim.kmax + 1; ry += -cSim.kmax + 1;
float kx = rx*cSim.recipBoxSizeX;
float ky = ry*cSim.recipBoxSizeY;
float kz = rz*cSim.recipBoxSizeZ;
// Compute the sum for this wave vector.
float2 sum = make_float2(0.0f, 0.0f); float2 sum = make_float2(0.0f, 0.0f);
for (int atom = 0; atom < cSim.atoms; atom++) for (int atom = 0; atom < cSim.atoms; atom++)
{ {
float2 tab_xy = (ry >= 0 ? MultofFloat2(EIR(atom, rx, 0), EIR(atom, ry, 1)) : float4 apos = cSim.pPosq[atom];
ConjMultofFloat2(EIR(atom, rx, 0), EIR(atom, -ry, 1))); float phase = apos.x*kx;
float charge = cSim.pPosq[atom].w; float2 structureFactor = make_float2(cos(phase), sin(phase));
float2 structureFactor = (rz >= 0 ? MultofFloat2(tab_xy, EIR(atom, rz, 2)) : phase = apos.y*ky;
ConjMultofFloat2(tab_xy, EIR(atom, -rz, 2))); structureFactor = MultofFloat2(structureFactor, make_float2(cos(phase), sin(phase)));
sum.x += charge*structureFactor.x; phase = apos.z*kz;
sum.y += charge*structureFactor.y; structureFactor = MultofFloat2(structureFactor, make_float2(cos(phase), sin(phase)));
sum.x += apos.w*structureFactor.x;
sum.y += apos.w*structureFactor.y;
} }
cSim.pEwaldCosSinSum[index] = sum; cSim.pEwaldCosSinSum[index] = sum;
index += blockDim.x * gridDim.x; index += blockDim.x * gridDim.x;
} }
} }
/**
* Compute the reciprocal space part of the Ewald force, using the precomputed sums from the
* previous routine.
*/
__global__ void kCalculateEwaldFastForces_kernel() __global__ void kCalculateEwaldFastForces_kernel()
{ {
...@@ -122,8 +101,6 @@ __global__ void kCalculateEwaldFastForces_kernel() ...@@ -122,8 +101,6 @@ __global__ void kCalculateEwaldFastForces_kernel()
const float epsilon = 1.0; const float epsilon = 1.0;
float recipCoeff = cSim.epsfac*(4*PI/cSim.cellVolume/epsilon); float recipCoeff = cSim.epsfac*(4*PI/cSim.cellVolume/epsilon);
int lowry = 0;
int lowrz = 1;
const int numRx = cSim.kmax; const int numRx = cSim.kmax;
const int numRy = cSim.kmax; const int numRy = cSim.kmax;
const int numRz = cSim.kmax; const int numRz = cSim.kmax;
...@@ -133,22 +110,32 @@ __global__ void kCalculateEwaldFastForces_kernel() ...@@ -133,22 +110,32 @@ __global__ void kCalculateEwaldFastForces_kernel()
while (atom < cSim.atoms) while (atom < cSim.atoms)
{ {
float4 force = cSim.pForce4[atom]; float4 force = cSim.pForce4[atom];
float charge = cSim.pPosq[atom].w; float4 apos = cSim.pPosq[atom];
// Loop over all wave vectors.
int lowry = 0;
int lowrz = 1;
for (int rx = 0; rx < numRx; rx++) { for (int rx = 0; rx < numRx; rx++) {
float kx = rx * cSim.recipBoxSizeX; float kx = rx * cSim.recipBoxSizeX;
for (int ry = lowry; ry < numRy; ry++) { for (int ry = lowry; ry < numRy; ry++) {
float ky = ry * cSim.recipBoxSizeY; float ky = ry * cSim.recipBoxSizeY;
float2 tab_xy = (ry >= 0 ? MultofFloat2(EIR(atom, rx, 0), EIR(atom, ry, 1)) : float phase = apos.x*kx;
ConjMultofFloat2(EIR(atom, rx, 0), EIR(atom, -ry, 1))); float2 tab_xy = make_float2(cos(phase), sin(phase));
phase = apos.y*ky;
tab_xy = MultofFloat2(tab_xy, make_float2(cos(phase), sin(phase)));
for (int rz = lowrz; rz < numRz; rz++) { for (int rz = lowrz; rz < numRz; rz++) {
float kz = rz * cSim.recipBoxSizeZ; float kz = rz * cSim.recipBoxSizeZ;
// Compute the force contribution of this wave vector.
int index = rx*(numRy*2-1)*(numRz*2-1) + (ry+numRy-1)*(numRz*2-1) + (rz+numRz-1); int index = rx*(numRy*2-1)*(numRz*2-1) + (ry+numRy-1)*(numRz*2-1) + (rz+numRz-1);
float k2 = kx*kx + ky*ky + kz*kz; float k2 = kx*kx + ky*ky + kz*kz;
float ak = exp(k2*cSim.factorEwald) / k2; float ak = exp(k2*cSim.factorEwald) / k2;
float2 structureFactor = (rz >= 0 ? MultofFloat2(tab_xy, EIR(atom, rz, 2)) : phase = apos.z*kz;
ConjMultofFloat2(tab_xy, EIR(atom, -rz, 2))); float2 structureFactor = MultofFloat2(tab_xy, make_float2(cos(phase), sin(phase)));
float2 cosSinSum = cSim.pEwaldCosSinSum[index]; float2 cosSinSum = cSim.pEwaldCosSinSum[index];
float dEdR = ak * charge * (cosSinSum.x*structureFactor.y - cosSinSum.y*structureFactor.x); float dEdR = ak * apos.w * (cosSinSum.x*structureFactor.y - cosSinSum.y*structureFactor.x);
force.x += 2 * recipCoeff * dEdR * kx; force.x += 2 * recipCoeff * dEdR * kx;
force.y += 2 * recipCoeff * dEdR * ky; force.y += 2 * recipCoeff * dEdR * ky;
force.z += 2 * recipCoeff * dEdR * kz; force.z += 2 * recipCoeff * dEdR * kz;
...@@ -157,6 +144,9 @@ __global__ void kCalculateEwaldFastForces_kernel() ...@@ -157,6 +144,9 @@ __global__ void kCalculateEwaldFastForces_kernel()
lowry = 1 - numRy; lowry = 1 - numRy;
} }
} }
// Record the force on the atom.
cSim.pForce4[atom] = force; cSim.pForce4[atom] = force;
atom += blockDim.x * gridDim.x; atom += blockDim.x * gridDim.x;
} }
......
...@@ -212,10 +212,8 @@ void kCalculateCDLJForces(gpuContext gpu) ...@@ -212,10 +212,8 @@ void kCalculateCDLJForces(gpuContext gpu)
} }
else else
{ {
// O(N3/2) Ewald summation // Fast Ewald summation
kCalculateEwaldFastEikr_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(); kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
LAUNCHERROR("kCalculateEwaldFastEikr");
kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kCalculateEwaldFastCosSinSums"); LAUNCHERROR("kCalculateEwaldFastCosSinSums");
kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(); kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kCalculateEwaldFastForces"); LAUNCHERROR("kCalculateEwaldFastForces");
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment