Commit 96f3680d authored by Peter Eastman's avatar Peter Eastman
Browse files

Fixed CUDA implementation of O(n^3/2) Ewald

parent 86a4baba
......@@ -300,7 +300,7 @@ struct cudaGmxSimulation {
float cellVolume; // Ewald parameter alpha (a.k.a. kappa)
float alphaEwald; // Ewald parameter alpha (a.k.a. kappa)
float factorEwald; // - 1 ( 4 * alphaEwald * alphaEwald)
float kmax; // Maximum number of reciprocal vectors
int kmax; // Maximum number of reciprocal vectors
float reactionFieldK; // Constant for reaction field correction
float probeRadius; // SASA probe radius
float surfaceAreaFactor; // ACE approximation surface area factor
......@@ -337,9 +337,8 @@ struct cudaGmxSimulation {
float collisionProbability; // Collision probability for Andersen thermostat
float2* pObcData; // Pointer to fixed Born data
float2* pAttr; // Pointer to additional atom attributes (sig, eps)
float2* pEikr; // Pointer to exponents of reciprocal vectors and atom coordinates (ewald)
float2* pStructureFactor; // Pointer to the structure factors (ewald)
float2* pCosSinSum; // Pointer to the cos/sin sums (ewald)
float2* pEwaldEikr; // Pointer to exponents of reciprocal vectors and atom coordinates (ewald)
float2* pEwaldCosSinSum; // Pointer to the cos/sin sums (ewald)
unsigned int bonds; // Number of bonds
int4* pBondID; // Bond atom and output buffer IDs
float2* pBondParameter; // Bond parameters
......
......@@ -429,18 +429,13 @@ void gpuSetEwaldParameters(gpuContext gpu)//, float alphaEwald, int kmax )
// hard coded alphaEwald and kmax, no interface yet
float alpha = 3.123413f;
float PI = 3.14159265358979323846f;
float TWO_PI = 2.0f * PI;
gpu->sim.recipBoxSizeX = TWO_PI / gpu->sim.periodicBoxSizeX ;
gpu->sim.recipBoxSizeY = TWO_PI / gpu->sim.periodicBoxSizeY ;
gpu->sim.recipBoxSizeZ = TWO_PI / gpu->sim.periodicBoxSizeZ ;
gpu->sim.cellVolume = gpu->sim.periodicBoxSizeX * gpu->sim.periodicBoxSizeY * gpu->sim.periodicBoxSizeZ;
gpu->sim.alphaEwald = alpha;
gpu->sim.factorEwald = -1 / (4*alpha*alpha);
gpu->sim.kmax = 20+1;
gpu->psEwaldEikr = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms*gpu->sim.kmax, 1, "EwaldEikr");
gpu->sim.pEwaldEikr = gpu->psEwaldEikr->_pDevStream[0];
gpu->psEwaldCosSinSum = new CUDAStream<float2>((gpu->sim.kmax*2-1) * (gpu->sim.kmax*2-1) * (gpu->sim.kmax*2-1), 1, "EwaldCosSinSum");
gpu->sim.pEwaldCosSinSum = gpu->psEwaldCosSinSum->_pDevStream[0];
}
extern "C"
......@@ -449,6 +444,10 @@ void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize
gpu->sim.periodicBoxSizeX = xsize;
gpu->sim.periodicBoxSizeY = ysize;
gpu->sim.periodicBoxSizeZ = zsize;
gpu->sim.recipBoxSizeX = 2.0f*PI/gpu->sim.periodicBoxSizeX;
gpu->sim.recipBoxSizeY = 2.0f*PI/gpu->sim.periodicBoxSizeY;
gpu->sim.recipBoxSizeZ = 2.0f*PI/gpu->sim.periodicBoxSizeZ;
gpu->sim.cellVolume = gpu->sim.periodicBoxSizeX*gpu->sim.periodicBoxSizeY*gpu->sim.periodicBoxSizeZ;
}
extern "C"
......@@ -1002,12 +1001,6 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
gpu->sim.pObcChain = gpu->psObcChain->_pDevStream[0];
gpu->psSigEps2 = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "SigEps2");
gpu->sim.pAttr = gpu->psSigEps2->_pDevStream[0];
gpu->psEwaldEikr = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "EwaldEikr");
gpu->sim.pEikr = gpu->psEwaldEikr->_pDevStream[0];
gpu->psEwaldStructureFactor = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "EwaldStructureFactor");
gpu->sim.pStructureFactor = gpu->psEwaldStructureFactor->_pDevStream[0];
gpu->psEwaldCosSinSum = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "EwaldCosSinSum");
gpu->sim.pCosSinSum = gpu->psEwaldCosSinSum->_pDevStream[0];
gpu->psObcData = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "ObcData");
gpu->sim.pObcData = gpu->psObcData->_pDevStream[0];
gpu->psStepSize = new CUDAStream<float2>(1, 1, "StepSize");
......@@ -1281,6 +1274,8 @@ void* gpuInit(int numAtoms)
gpu->psRbDihedralParameter2 = NULL;
gpu->psLJ14ID = NULL;
gpu->psLJ14Parameter = NULL;
gpu->psEwaldEikr = NULL;
gpu->psEwaldCosSinSum = NULL;
gpu->psShakeID = NULL;
gpu->psShakeParameter = NULL;
gpu->psSettleID = NULL;
......@@ -1409,9 +1404,11 @@ void gpuShutDown(gpuContext gpu)
delete gpu->psxVector4;
delete gpu->psvVector4;
delete gpu->psSigEps2;
if (gpu->psEwaldEikr != NULL)
{
delete gpu->psEwaldEikr;
delete gpu->psEwaldStructureFactor;
delete gpu->psEwaldCosSinSum;
}
delete gpu->psObcData;
delete gpu->psObcChain;
delete gpu->psBornForce;
......
......@@ -87,7 +87,6 @@ struct _gpuContext {
CUDAStream<float4>* psvVector4;
CUDAStream<float2>* psSigEps2;
CUDAStream<float2>* psEwaldEikr;
CUDAStream<float2>* psEwaldStructureFactor;
CUDAStream<float2>* psEwaldCosSinSum;
CUDAStream<float2>* psObcData;
CUDAStream<float>* psObcChain;
......
......@@ -44,24 +44,6 @@
}
__device__ float2 FloatMultFloat2(float r, float2 a)
{
float2 b;
b.x = r*a.x;
b.y = r*a.y;
return b;
}
__device__ float2 FloatMultConjFloat2(float r, float2 a)
{
float2 b;
b.x = r*a.y;
b.y = r*a.x;
return b;
}
__global__ void kCalculateEwaldFastEikr_kernel()
{
......@@ -81,23 +63,23 @@ __global__ void kCalculateEwaldFastEikr_kernel()
// k = 0, explicitly
for(unsigned int m = 0; (m < 3); m++) {
cSim.pEikr[atom*kmax*3 + 0 + m].x = 1;
cSim.pEikr[atom*kmax*3 + 0 + m].y = 0;
cSim.pEwaldEikr[atom*kmax*3 + 0 + m].x = 1;
cSim.pEwaldEikr[atom*kmax*3 + 0 + m].y = 0;
}
// k = 1, explicitly
cSim.pEikr[atom*kmax*3 + 3 + 0].x = cos(apos.x*cSim.recipBoxSizeX);
cSim.pEikr[atom*kmax*3 + 3 + 0].y = sin(apos.x*cSim.recipBoxSizeX);
cSim.pEwaldEikr[atom*kmax*3 + 3 + 0].x = cos(apos.x*cSim.recipBoxSizeX);
cSim.pEwaldEikr[atom*kmax*3 + 3 + 0].y = sin(apos.x*cSim.recipBoxSizeX);
cSim.pEikr[atom*kmax*3 + 3 + 1].x = cos(apos.y*cSim.recipBoxSizeY);
cSim.pEikr[atom*kmax*3 + 3 + 1].y = sin(apos.y*cSim.recipBoxSizeY);
cSim.pEwaldEikr[atom*kmax*3 + 3 + 1].x = cos(apos.y*cSim.recipBoxSizeY);
cSim.pEwaldEikr[atom*kmax*3 + 3 + 1].y = sin(apos.y*cSim.recipBoxSizeY);
cSim.pEikr[atom*kmax*3 + 3 + 2].x = cos(apos.z*cSim.recipBoxSizeZ);
cSim.pEikr[atom*kmax*3 + 3 + 2].y = sin(apos.z*cSim.recipBoxSizeZ);
cSim.pEwaldEikr[atom*kmax*3 + 3 + 2].x = cos(apos.z*cSim.recipBoxSizeZ);
cSim.pEwaldEikr[atom*kmax*3 + 3 + 2].y = sin(apos.z*cSim.recipBoxSizeZ);
// k > 1, by recursion
for(unsigned int k=2; (k<kmax); k++) {
for(unsigned int m=0; (m<3); m++) {
cSim.pEikr[atom*kmax*3 + k*3 + m] = MultofFloat2 (cSim.pEikr[atom*kmax*3 + (k-1)*3 + m] , cSim.pEikr[atom*kmax*3 + 3 + m]);
cSim.pEwaldEikr[atom*kmax*3 + k*3 + m] = MultofFloat2 (cSim.pEwaldEikr[atom*kmax*3 + (k-1)*3 + m] , cSim.pEwaldEikr[atom*kmax*3 + 3 + m]);
}
}
......@@ -105,122 +87,32 @@ __global__ void kCalculateEwaldFastEikr_kernel()
}
}
__global__ void kCalculateEwaldFastStructureFactors_kernel()
{
// hard-coded maximum k-vectors, no interface yet
int kmax = cSim.kmax;
float4 apos;
int lowry = 0;
int lowrz = 1;
int numRx = 20+1;
int numRy = 20+1;
int numRz = 20+1;
unsigned int totalK = (numRx*2 - 1) * (numRy*2 - 1) * (numRz*2 - 1);
float2 tab_xy;
int index;
unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x;
while (atom < cSim.atoms)
{
apos = cSim.pPosq[atom];
// cSim.pEikr[atom*kmax*3 + k*3 + m]
for(int rx = 0; rx < numRx; rx++)
{
for(int ry = lowry; ry < numRy; ry++)
{
if(ry >= 0)
{
tab_xy = MultofFloat2 (cSim.pEikr[atom*kmax*3 + rx*3 + 0] , cSim.pEikr[atom*kmax*3 + ry*3 + 1]);
}
else
{
tab_xy = ConjMultofFloat2 (cSim.pEikr[atom*kmax*3 + rx*3 + 0] , cSim.pEikr[atom*kmax*3 - ry*3 + 1]);
}
for (int rz = lowrz; rz < numRz; rz++)
{
index = rx * (numRy*2 - 1 ) * (numRz*2 - 1) + (ry + numRy -1 ) * (numRz * 2 - 1) + (rz + numRz -1 );
if( rz >= 0)
{
cSim.pStructureFactor[atom*totalK + index] = FloatMultFloat2 ( (apos.w ), MultofFloat2 (tab_xy ,cSim.pEikr[atom*kmax*3 + rz*3 + 2] ));
}
else
{
cSim.pStructureFactor[atom*totalK + index] = FloatMultFloat2 ( ( apos.w ), ConjMultofFloat2 (tab_xy ,cSim.pEikr[atom*kmax*3 - rz*3 + 2] ));
}
cSim.pCosSinSum[index].x = 0.0;
cSim.pCosSinSum[index].y = 0.0;
lowrz = 1 - numRz;
}
lowry = 1 - numRy;
}
}
atom += blockDim.x * gridDim.x;
}
}
__global__ void kCalculateEwaldFastCosSinSums_kernel()
{
// float2 eikr;
int lowry = 0;
int lowrz = 1;
int numRx = 20+1;
int numRy = 20+1;
int numRz = 20+1;
unsigned int totalK = (numRx*2 - 1) * (numRy*2 - 1) * (numRz*2 - 1);
int index;
unsigned int rx = threadIdx.x + blockIdx.x * blockDim.x;
while (rx < numRx)
{
// **********************************************************************
// cSim.pEikr[atom*kmax*3 + k*3 + m]
// for(int rx = 0; rx < numRx; rx++) {
for(int ry = lowry; ry < numRy; ry++) {
for (int rz = lowrz; rz < numRz; rz++) {
index = rx * (numRy*2 - 1 ) * (numRz*2 - 1) + (ry + numRy -1 ) * (numRz * 2 - 1) + (rz + numRz -1 );
const unsigned int ksize = 2*cSim.kmax-1;
const unsigned int totalK = ksize*ksize*ksize;
unsigned int index = threadIdx.x + blockIdx.x * blockDim.x;
while (index < totalK)
{
int rx = index/(ksize*ksize);
int remainder = index - rx*ksize*ksize;
int ry = remainder/ksize;
int rz = remainder - ry*ksize - cSim.kmax + 1;
ry += -cSim.kmax + 1;
float2 sum = make_float2(0.0f, 0.0f);
for (int atom = 0; atom < cSim.atoms; atom++)
{
cSim.pCosSinSum[index].x += cSim.pStructureFactor[atom*totalK + index].x;
cSim.pCosSinSum[index].y += cSim.pStructureFactor[atom*totalK + index].y;
}
lowrz = 1 - numRz;
}
lowry = 1 - numRy;
float2 tab_xy = (ry >= 0 ? MultofFloat2 (cSim.pEwaldEikr[atom*cSim.kmax*3 + rx*3 + 0], cSim.pEwaldEikr[atom*cSim.kmax*3 + ry*3 + 1]) :
ConjMultofFloat2 (cSim.pEwaldEikr[atom*cSim.kmax*3 + rx*3 + 0], cSim.pEwaldEikr[atom*cSim.kmax*3 - ry*3 + 1]));
float charge = cSim.pPosq[atom].w;
float2 structureFactor = (rz >= 0 ? MultofFloat2(tab_xy, cSim.pEwaldEikr[atom*cSim.kmax*3 + rz*3 + 2]) :
ConjMultofFloat2(tab_xy, cSim.pEwaldEikr[atom*cSim.kmax*3 - rz*3 + 2]));
sum.x += charge*structureFactor.x;
sum.y += charge*structureFactor.y;
}
rx += blockDim.x * gridDim.x;
cSim.pEwaldCosSinSum[index] = sum;
index += blockDim.x * gridDim.x;
}
}
__global__ void kCalculateEwaldFastForces_kernel()
......@@ -228,53 +120,41 @@ __global__ void kCalculateEwaldFastForces_kernel()
float PI = 3.14159265358979323846f;
const float epsilon = 1.0;
float recipCoeff = (4*PI/cSim.V/epsilon);
float recipCoeff = cSim.epsfac*(4*PI/cSim.cellVolume/epsilon);
int lowry = 0;
int lowrz = 1;
int numRx = 20+1;
int numRy = 20+1;
int numRz = 20+1;
unsigned int totalK = (numRx*2 - 1) * (numRy*2 - 1) * (numRz*2 - 1);
int index;
const int numRx = cSim.kmax;
const int numRy = cSim.kmax;
const int numRz = cSim.kmax;
unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x;
while (atom < cSim.atoms)
{
for(int rx = 0; rx < numRx; rx++) {
float charge = cSim.pPosq[atom].w;
for (int rx = 0; rx < numRx; rx++) {
float kx = rx * cSim.recipBoxSizeX;
for(int ry = lowry; ry < numRy; ry++) {
for (int ry = lowry; ry < numRy; ry++) {
float ky = ry * cSim.recipBoxSizeY;
float2 tab_xy = (ry >= 0 ? MultofFloat2(cSim.pEwaldEikr[atom*cSim.kmax*3 + rx*3 + 0], cSim.pEwaldEikr[atom*cSim.kmax*3 + ry*3 + 1]) :
ConjMultofFloat2(cSim.pEwaldEikr[atom*cSim.kmax*3 + rx*3 + 0], cSim.pEwaldEikr[atom*cSim.kmax*3 - ry*3 + 1]));
for (int rz = lowrz; rz < numRz; rz++) {
float kz = rz * cSim.recipBoxSizeZ;
// next one is scary!
index = rx * (numRy*2 - 1 ) * (numRz*2 - 1) + (ry + numRy -1 ) * (numRz * 2 - 1) + (rz + numRz -1 );
float k2 = kx * kx + ky * ky + kz * kz;
int index = rx*(numRy*2-1)*(numRz*2-1) + (ry+numRy-1)*(numRz*2-1) + (rz+numRz-1);
float k2 = kx*kx + ky*ky + kz*kz;
float ak = exp(k2*cSim.factorEwald) / k2;
float dEdR = ak * (cSim.pCosSinSum[index].x * cSim.pStructureFactor[atom*totalK + index].y - cSim.pCosSinSum[index].y * cSim.pStructureFactor[atom*totalK + index].x);
cSim.pForce4[atom].x += 2 * recipCoeff * dEdR * kx ;
cSim.pForce4[atom].y += 2 * recipCoeff * dEdR * ky ;
cSim.pForce4[atom].z += 2 * recipCoeff * dEdR * kz ;
float2 structureFactor = (rz >= 0 ? MultofFloat2(tab_xy, cSim.pEwaldEikr[atom*cSim.kmax*3 + rz*3 + 2]) :
ConjMultofFloat2(tab_xy, cSim.pEwaldEikr[atom*cSim.kmax*3 - rz*3 + 2]));
float dEdR = ak * charge * (cSim.pEwaldCosSinSum[index].x*structureFactor.y - cSim.pEwaldCosSinSum[index].y*structureFactor.x);
cSim.pForce4[atom].x += 2 * recipCoeff * dEdR * kx;
cSim.pForce4[atom].y += 2 * recipCoeff * dEdR * ky;
cSim.pForce4[atom].z += 2 * recipCoeff * dEdR * kz;
lowrz = 1 - numRz;
}
lowry = 1 - numRy;
}
}
atom += blockDim.x * gridDim.x;
}
}
......@@ -207,22 +207,15 @@ void kCalculateCDLJForces(gpuContext gpu)
}
else
{
// Vanilla (N2) Ewald
kCalculateCDLJEwaldDirectForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJEwaldDirectForces");
kCalculateCDLJEwaldReciprocalForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kCalculateCDLJEwaldReciprocalForces");
// If using Fast Ewald, uncomment the lines below
// kCalculateEwaldFastEikr_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
// LAUNCHERROR("kCalculateEwaldFastEikr");
// kCalculateEwaldFastStructureFactors_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
// LAUNCHERROR("kCalculateEwaldFastStructureFactors_kernel");
// kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
// LAUNCHERROR("kCalculateEwaldFastCosSinSums");
// kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
// LAUNCHERROR("kCalculateEwaldFastForces");
kCalculateEwaldFastEikr_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kCalculateEwaldFastEikr");
kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kCalculateEwaldFastCosSinSums");
kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kCalculateEwaldFastForces");
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment