Commit 705ec545 authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimizations to Ewald summation (in progress)

parent 530e4ea0
...@@ -432,7 +432,7 @@ void gpuSetEwaldParameters(gpuContext gpu)//, float alphaEwald, int kmax ) ...@@ -432,7 +432,7 @@ void gpuSetEwaldParameters(gpuContext gpu)//, float alphaEwald, int kmax )
gpu->sim.alphaEwald = alpha; gpu->sim.alphaEwald = alpha;
gpu->sim.factorEwald = -1 / (4*alpha*alpha); gpu->sim.factorEwald = -1 / (4*alpha*alpha);
gpu->sim.kmax = 20+1; gpu->sim.kmax = 20+1;
gpu->psEwaldEikr = new CUDAStream<float2>(gpu->sim.atoms*gpu->sim.kmax, 1, "EwaldEikr"); gpu->psEwaldEikr = new CUDAStream<float2>(gpu->sim.atoms*gpu->sim.kmax*3, 1, "EwaldEikr");
gpu->sim.pEwaldEikr = gpu->psEwaldEikr->_pDevStream[0]; gpu->sim.pEwaldEikr = gpu->psEwaldEikr->_pDevStream[0];
gpu->psEwaldCosSinSum = new CUDAStream<float2>((gpu->sim.kmax*2-1) * (gpu->sim.kmax*2-1) * (gpu->sim.kmax*2-1), 1, "EwaldCosSinSum"); gpu->psEwaldCosSinSum = new CUDAStream<float2>((gpu->sim.kmax*2-1) * (gpu->sim.kmax*2-1) * (gpu->sim.kmax*2-1), 1, "EwaldCosSinSum");
gpu->sim.pEwaldCosSinSum = gpu->psEwaldCosSinSum->_pDevStream[0]; gpu->sim.pEwaldCosSinSum = gpu->psEwaldCosSinSum->_pDevStream[0];
......
...@@ -24,25 +24,25 @@ ...@@ -24,25 +24,25 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>. * * along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
/* Define multiply operations for floats */ /* Define multiply operations for floats */
__device__ float2 MultofFloat2(float2 a, float2 b) __device__ float2 MultofFloat2(float2 a, float2 b)
{ {
float2 c; float2 c;
c.x = a.x * b.x - a.y * b.y; c.x = a.x * b.x - a.y * b.y;
c.y = a.x * b.y + a.y * b.x; c.y = a.x * b.y + a.y * b.x;
return c; return c;
}
} __device__ float2 ConjMultofFloat2(float2 a, float2 b)
{
__device__ float2 ConjMultofFloat2(float2 a, float2 b)
{
float2 c; float2 c;
c.x = a.x*b.x + a.y*b.y; c.x = a.x*b.x + a.y*b.y;
c.y = a.y*b.x - a.x*b.y; c.y = a.y*b.x - a.x*b.y;
return c; return c;
}
} #define EIR(x, y, z) cSim.pEwaldEikr[(x)+(y)*cSim.kmax+(z)*cSim.kmax*3]
__global__ void kCalculateEwaldFastEikr_kernel() __global__ void kCalculateEwaldFastEikr_kernel()
{ {
...@@ -63,23 +63,23 @@ __global__ void kCalculateEwaldFastEikr_kernel() ...@@ -63,23 +63,23 @@ __global__ void kCalculateEwaldFastEikr_kernel()
// k = 0, explicitly // k = 0, explicitly
for(unsigned int m = 0; (m < 3); m++) { for(unsigned int m = 0; (m < 3); m++) {
cSim.pEwaldEikr[atom*kmax*3 + 0 + m].x = 1; EIR(atom, 0, m).x = 1;
cSim.pEwaldEikr[atom*kmax*3 + 0 + m].y = 0; EIR(atom, 0, m).y = 0;
} }
// k = 1, explicitly // k = 1, explicitly
cSim.pEwaldEikr[atom*kmax*3 + 3 + 0].x = cos(apos.x*cSim.recipBoxSizeX); EIR(atom, 1, 0).x = cos(apos.x*cSim.recipBoxSizeX);
cSim.pEwaldEikr[atom*kmax*3 + 3 + 0].y = sin(apos.x*cSim.recipBoxSizeX); EIR(atom, 1, 0).y = sin(apos.x*cSim.recipBoxSizeX);
cSim.pEwaldEikr[atom*kmax*3 + 3 + 1].x = cos(apos.y*cSim.recipBoxSizeY); EIR(atom, 1, 1).x = cos(apos.y*cSim.recipBoxSizeY);
cSim.pEwaldEikr[atom*kmax*3 + 3 + 1].y = sin(apos.y*cSim.recipBoxSizeY); EIR(atom, 1, 1).y = sin(apos.y*cSim.recipBoxSizeY);
cSim.pEwaldEikr[atom*kmax*3 + 3 + 2].x = cos(apos.z*cSim.recipBoxSizeZ); EIR(atom, 1, 2).x = cos(apos.z*cSim.recipBoxSizeZ);
cSim.pEwaldEikr[atom*kmax*3 + 3 + 2].y = sin(apos.z*cSim.recipBoxSizeZ); EIR(atom, 1, 2).y = sin(apos.z*cSim.recipBoxSizeZ);
// k > 1, by recursion // k > 1, by recursion
for(unsigned int k=2; (k<kmax); k++) { for(unsigned int k=2; (k<kmax); k++) {
for(unsigned int m=0; (m<3); m++) { for(unsigned int m=0; (m<3); m++) {
cSim.pEwaldEikr[atom*kmax*3 + k*3 + m] = MultofFloat2 (cSim.pEwaldEikr[atom*kmax*3 + (k-1)*3 + m] , cSim.pEwaldEikr[atom*kmax*3 + 3 + m]); EIR(atom, k, m) = MultofFloat2(EIR(atom, k-1, m), EIR(atom, 1, m));
} }
} }
...@@ -102,11 +102,11 @@ __global__ void kCalculateEwaldFastCosSinSums_kernel() ...@@ -102,11 +102,11 @@ __global__ void kCalculateEwaldFastCosSinSums_kernel()
float2 sum = make_float2(0.0f, 0.0f); float2 sum = make_float2(0.0f, 0.0f);
for (int atom = 0; atom < cSim.atoms; atom++) for (int atom = 0; atom < cSim.atoms; atom++)
{ {
float2 tab_xy = (ry >= 0 ? MultofFloat2 (cSim.pEwaldEikr[atom*cSim.kmax*3 + rx*3 + 0], cSim.pEwaldEikr[atom*cSim.kmax*3 + ry*3 + 1]) : float2 tab_xy = (ry >= 0 ? MultofFloat2(EIR(atom, rx, 0), EIR(atom, ry, 1)) :
ConjMultofFloat2 (cSim.pEwaldEikr[atom*cSim.kmax*3 + rx*3 + 0], cSim.pEwaldEikr[atom*cSim.kmax*3 - ry*3 + 1])); ConjMultofFloat2(EIR(atom, rx, 0), EIR(atom, -ry, 1)));
float charge = cSim.pPosq[atom].w; float charge = cSim.pPosq[atom].w;
float2 structureFactor = (rz >= 0 ? MultofFloat2(tab_xy, cSim.pEwaldEikr[atom*cSim.kmax*3 + rz*3 + 2]) : float2 structureFactor = (rz >= 0 ? MultofFloat2(tab_xy, EIR(atom, rz, 2)) :
ConjMultofFloat2(tab_xy, cSim.pEwaldEikr[atom*cSim.kmax*3 - rz*3 + 2])); ConjMultofFloat2(tab_xy, EIR(atom, -rz, 2)));
sum.x += charge*structureFactor.x; sum.x += charge*structureFactor.x;
sum.y += charge*structureFactor.y; sum.y += charge*structureFactor.y;
} }
...@@ -132,29 +132,32 @@ __global__ void kCalculateEwaldFastForces_kernel() ...@@ -132,29 +132,32 @@ __global__ void kCalculateEwaldFastForces_kernel()
while (atom < cSim.atoms) while (atom < cSim.atoms)
{ {
float4 force = cSim.pForce4[atom];
float charge = cSim.pPosq[atom].w; float charge = cSim.pPosq[atom].w;
for (int rx = 0; rx < numRx; rx++) { for (int rx = 0; rx < numRx; rx++) {
float kx = rx * cSim.recipBoxSizeX; float kx = rx * cSim.recipBoxSizeX;
for (int ry = lowry; ry < numRy; ry++) { for (int ry = lowry; ry < numRy; ry++) {
float ky = ry * cSim.recipBoxSizeY; float ky = ry * cSim.recipBoxSizeY;
float2 tab_xy = (ry >= 0 ? MultofFloat2(cSim.pEwaldEikr[atom*cSim.kmax*3 + rx*3 + 0], cSim.pEwaldEikr[atom*cSim.kmax*3 + ry*3 + 1]) : float2 tab_xy = (ry >= 0 ? MultofFloat2(EIR(atom, rx, 0), EIR(atom, ry, 1)) :
ConjMultofFloat2(cSim.pEwaldEikr[atom*cSim.kmax*3 + rx*3 + 0], cSim.pEwaldEikr[atom*cSim.kmax*3 - ry*3 + 1])); ConjMultofFloat2(EIR(atom, rx, 0), EIR(atom, -ry, 1)));
for (int rz = lowrz; rz < numRz; rz++) { for (int rz = lowrz; rz < numRz; rz++) {
float kz = rz * cSim.recipBoxSizeZ; float kz = rz * cSim.recipBoxSizeZ;
int index = rx*(numRy*2-1)*(numRz*2-1) + (ry+numRy-1)*(numRz*2-1) + (rz+numRz-1); int index = rx*(numRy*2-1)*(numRz*2-1) + (ry+numRy-1)*(numRz*2-1) + (rz+numRz-1);
float k2 = kx*kx + ky*ky + kz*kz; float k2 = kx*kx + ky*ky + kz*kz;
float ak = exp(k2*cSim.factorEwald) / k2; float ak = exp(k2*cSim.factorEwald) / k2;
float2 structureFactor = (rz >= 0 ? MultofFloat2(tab_xy, cSim.pEwaldEikr[atom*cSim.kmax*3 + rz*3 + 2]) : float2 structureFactor = (rz >= 0 ? MultofFloat2(tab_xy, EIR(atom, rz, 2)) :
ConjMultofFloat2(tab_xy, cSim.pEwaldEikr[atom*cSim.kmax*3 - rz*3 + 2])); ConjMultofFloat2(tab_xy, EIR(atom, -rz, 2)));
float dEdR = ak * charge * (cSim.pEwaldCosSinSum[index].x*structureFactor.y - cSim.pEwaldCosSinSum[index].y*structureFactor.x); float2 cosSinSum = cSim.pEwaldCosSinSum[index];
cSim.pForce4[atom].x += 2 * recipCoeff * dEdR * kx; float dEdR = ak * charge * (cosSinSum.x*structureFactor.y - cosSinSum.y*structureFactor.x);
cSim.pForce4[atom].y += 2 * recipCoeff * dEdR * ky; force.x += 2 * recipCoeff * dEdR * kx;
cSim.pForce4[atom].z += 2 * recipCoeff * dEdR * kz; force.y += 2 * recipCoeff * dEdR * ky;
force.z += 2 * recipCoeff * dEdR * kz;
lowrz = 1 - numRz; lowrz = 1 - numRz;
} }
lowry = 1 - numRy; lowry = 1 - numRy;
} }
} }
cSim.pForce4[atom] = force;
atom += blockDim.x * gridDim.x; atom += blockDim.x * gridDim.x;
} }
} }
...@@ -32,74 +32,54 @@ ...@@ -32,74 +32,54 @@
__global__ void kCalculateCDLJEwaldReciprocalForces_kernel() __global__ void kCalculateCDLJEwaldReciprocalForces_kernel()
{ {
float eps0 = 5.72765E-4; const float eps0 = 1.0f/(4.0f*3.1415926535f*cSim.epsfac);
int numRx = 20+1;
int numRy = 20+1;
int numRz = 20+1;
int lowry, lowrz;
float kx, ky, kz, k2, ek;
float Qi, Qj, SinI, SinJ, CosI, CosJ;
float V = cSim.cellVolume;
float4 apos1, apos2 ; // i nteracting atoms
float4 af; // atomic force
unsigned int atomID1 = threadIdx.x + blockIdx.x * blockDim.x; unsigned int atomID1 = threadIdx.x + blockIdx.x * blockDim.x;
while (atomID1 < cSim.atoms) while (atomID1 < cSim.atoms)
{ {
apos1 = cSim.pPosq[atomID1]; float4 apos1 = cSim.pPosq[atomID1];
af = cSim.pForce4[atomID1]; float4 af = cSim.pForce4[atomID1];
unsigned int atomID2 = 0; unsigned int atomID2 = 0;
while (atomID2 < cSim.atoms) while (atomID2 < cSim.atoms)
{ {
apos2 = cSim.pPosq[atomID2]; float4 apos2 = cSim.pPosq[atomID2];
float scale = 2.0f*apos1.w*apos2.w/(cSim.cellVolume*eps0);
lowry = 0;
lowrz = 1;
for(int rx = 0; rx < numRx; rx++) {
kx = rx * cSim.recipBoxSizeX;
for(int ry = lowry; ry < numRy; ry++) {
ky = ry * cSim.recipBoxSizeY;
for (int rz = lowrz; rz < numRz; rz++) { int lowry = 0;
int lowrz = 1;
kz = rz * cSim.recipBoxSizeZ; for(int rx = 0; rx < cSim.kmax; rx++)
{
k2 = kx * kx + ky * ky + kz * kz; float kx = rx*cSim.recipBoxSizeX;
ek = exp ( k2 * cSim.factorEwald); for(int ry = lowry; ry < cSim.kmax; ry++)
{
Qi = apos1.w ; float ky = ry*cSim.recipBoxSizeY;
Qj = apos2.w ; for (int rz = lowrz; rz < cSim.kmax; rz++)
{
SinI = sin ( kx * apos1.x + ky * apos1.y + kz * apos1.z ); float kz = rz*cSim.recipBoxSizeZ;
SinJ = sin ( kx * apos2.x + ky * apos2.y + kz * apos2.z ); float k2 = kx*kx + ky*ky + kz*kz;
CosI = cos ( kx * apos1.x + ky * apos1.y + kz * apos1.z ); float ek = exp(k2*cSim.factorEwald);
CosJ = cos ( kx * apos2.x + ky * apos2.y + kz * apos2.z );
float arg1 = kx*apos1.x + ky*apos1.y + kz*apos1.z;
af.x -= (2.0 / (V * eps0 )) * Qi * ( kx/k2) * ek * ( - SinI * Qj * CosJ + CosI * Qj * SinJ); float arg2 = kx*apos2.x + ky*apos2.y + kz*apos2.z;
af.y -= (2.0 / (V * eps0 )) * Qi * ( ky/k2) * ek * ( - SinI * Qj * CosJ + CosI * Qj * SinJ); float sinI = sin(arg1);
af.z -= (2.0 / (V * eps0 )) * Qi * ( kz/k2) * ek * ( - SinI * Qj * CosJ + CosI * Qj * SinJ); float sinJ = sin(arg2);
float cosI = cos(arg1);
lowrz = 1 - numRz; float cosJ = cos(arg2);
float f = scale * ek * (-sinI*cosJ + cosI*sinJ) / k2;
af.x -= kx*f;
af.y -= ky*f;
af.z -= kz*f;
lowrz = 1 - cSim.kmax;
} }
lowry = 1 - cSim.kmax;
lowry = 1 - numRy;
} }
} }
atomID2++; atomID2++;
} }
cSim.pForce4[atomID1] = af; cSim.pForce4[atomID1] = af;
atomID1 += blockDim.x * gridDim.x; atomID1 += blockDim.x * gridDim.x;
......
...@@ -155,7 +155,7 @@ __global__ void kLangevinUpdatePart2_kernel() ...@@ -155,7 +155,7 @@ __global__ void kLangevinUpdatePart2_kernel()
CM.x += mass * velocity.x; CM.x += mass * velocity.x;
CM.y += mass * velocity.y; CM.y += mass * velocity.y;
CM.z += mass * velocity.z; CM.z += mass * velocity.z;
#endif; #endif
Xmh.x = vVector.x * cSim.TauDOverEMMinusOne + Xmh.x = vVector.x * cSim.TauDOverEMMinusOne +
sqrtInvMass * random4b.x; sqrtInvMass * random4b.x;
......
...@@ -36,6 +36,8 @@ ...@@ -36,6 +36,8 @@
// make sure that erf() and erfc() are defined. // make sure that erf() and erfc() are defined.
#include "MSVC_erfc.h" #include "MSVC_erfc.h"
using std::vector;
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
ReferenceLJCoulombIxn constructor ReferenceLJCoulombIxn constructor
...@@ -279,9 +281,9 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at ...@@ -279,9 +281,9 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
// setup K-vectors // setup K-vectors
#define EIR(x, y, z) eir[(x)*numberOfAtoms*3+(y)*3+z] #define EIR(x, y, z) eir[(x)*numberOfAtoms*3+(y)*3+z]
d_complex* eir = new d_complex[kmax*numberOfAtoms*3]; vector<d_complex> eir(kmax*numberOfAtoms*3);
d_complex* tab_xy = new d_complex[numberOfAtoms]; vector<d_complex> tab_xy(numberOfAtoms);
d_complex* tab_qxyz = new d_complex[numberOfAtoms]; vector<d_complex> tab_qxyz(numberOfAtoms);
if (kmax < 1) { if (kmax < 1) {
std::stringstream message; std::stringstream message;
...@@ -389,7 +391,7 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at ...@@ -389,7 +391,7 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
// allocate and initialize exclusion array // allocate and initialize exclusion array
int* exclusionIndices = new int[numberOfAtoms]; vector<int> exclusionIndices(numberOfAtoms);
for( int ii = 0; ii < numberOfAtoms; ii++ ){ for( int ii = 0; ii < numberOfAtoms; ii++ ){
exclusionIndices[ii] = -1; exclusionIndices[ii] = -1;
} }
...@@ -428,11 +430,6 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at ...@@ -428,11 +430,6 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
} }
} }
delete[] exclusionIndices;
delete[] eir;
delete[] tab_xy;
delete[] tab_qxyz;
// *********************************************************************** // ***********************************************************************
if( totalEnergy ) { if( totalEnergy ) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment