Commit 9fdfaca9 authored by Rossen Apostolov's avatar Rossen Apostolov
Browse files

More work on fast Ewald (N^3/2) in Cuda

parent 6fd7f3bb
...@@ -73,25 +73,27 @@ __global__ void kCalculateEwaldFastEikr_kernel() ...@@ -73,25 +73,27 @@ __global__ void kCalculateEwaldFastEikr_kernel()
while (atom < cSim.atoms) while (atom < cSim.atoms)
{ {
apos = cSim.pPosq[atom];
//generic form of the array //generic form of the array
// pEikr[ atomID*kmax*3 + k*3 + m] // pEikr[ atomID*kmax*3 + k*3 + m]
for(unsigned int m = 0; (m < 3); m++) {
// k = 0, explicitly // k = 0, explicitly
for(unsigned int m = 0; (m < 3); m++) {
cSim.pEikr[atom*kmax*3 + 0 + m].x = 1; cSim.pEikr[atom*kmax*3 + 0 + m].x = 1;
cSim.pEikr[atom*kmax*3 + 0 + m].y = 0; cSim.pEikr[atom*kmax*3 + 0 + m].y = 0;
}
// k = 1, explicitly // k = 1, explicitly
cSim.pEikr[atom*kmax*3 + 3 + m].x = cos(apos.x*cSim.recipBoxSizeX); cSim.pEikr[atom*kmax*3 + 3 + 0].x = cos(apos.x*cSim.recipBoxSizeX);
cSim.pEikr[atom*kmax*3 + 3 + m].y = sin(apos.x*cSim.recipBoxSizeX); cSim.pEikr[atom*kmax*3 + 3 + 0].y = sin(apos.x*cSim.recipBoxSizeX);
cSim.pEikr[atom*kmax*3 + 4 + m].x = cos(apos.y*cSim.recipBoxSizeY); cSim.pEikr[atom*kmax*3 + 3 + 1].x = cos(apos.y*cSim.recipBoxSizeY);
cSim.pEikr[atom*kmax*3 + 4 + m].y = sin(apos.y*cSim.recipBoxSizeY); cSim.pEikr[atom*kmax*3 + 3 + 1].y = sin(apos.y*cSim.recipBoxSizeY);
cSim.pEikr[atom*kmax*3 + 5 + m].x = cos(apos.z*cSim.recipBoxSizeZ); cSim.pEikr[atom*kmax*3 + 3 + 2].x = cos(apos.z*cSim.recipBoxSizeZ);
cSim.pEikr[atom*kmax*3 + 5 + m].y = sin(apos.z*cSim.recipBoxSizeZ); cSim.pEikr[atom*kmax*3 + 3 + 2].y = sin(apos.z*cSim.recipBoxSizeZ);
}
// k > 1, by recursion // k > 1, by recursion
for(unsigned int k=2; (k<kmax); k++) { for(unsigned int k=2; (k<kmax); k++) {
for(unsigned int m=0; (m<3); m++) { for(unsigned int m=0; (m<3); m++) {
...@@ -103,13 +105,12 @@ __global__ void kCalculateEwaldFastEikr_kernel() ...@@ -103,13 +105,12 @@ __global__ void kCalculateEwaldFastEikr_kernel()
} }
} }
__global__ void kCalculateEwaldFastCosSinSums_kernel() __global__ void kCalculateEwaldFastStructureFactors_kernel()
{ {
// hard-coded maximum k-vectors, no interface yet // hard-coded maximum k-vectors, no interface yet
int kmax = cSim.kmax; int kmax = cSim.kmax;
// float2 eikr;
float4 apos; float4 apos;
int lowry = 0; int lowry = 0;
int lowrz = 1; int lowrz = 1;
...@@ -125,64 +126,110 @@ __global__ void kCalculateEwaldFastCosSinSums_kernel() ...@@ -125,64 +126,110 @@ __global__ void kCalculateEwaldFastCosSinSums_kernel()
while (atom < cSim.atoms) while (atom < cSim.atoms)
{ {
apos = cSim.pPosq[atom];
apos = cSim.pPosq[atom];
// cSim.pEikr[atom*kmax*3 + k*3 + m]
for(int rx = 0; rx < numRx; rx++)
{
for(int ry = lowry; ry < numRy; ry++)
{
if(ry >= 0)
{
tab_xy = MultofFloat2 (cSim.pEikr[atom*kmax*3 + rx*3 + 0] , cSim.pEikr[atom*kmax*3 + ry*3 + 1]);
}
else
{
tab_xy = ConjMultofFloat2 (cSim.pEikr[atom*kmax*3 + rx*3 + 0] , cSim.pEikr[atom*kmax*3 - ry*3 + 1]);
}
for (int rz = lowrz; rz < numRz; rz++)
{
index = rx * (numRy*2 - 1 ) * (numRz*2 - 1) + (ry + numRy -1 ) * (numRz * 2 - 1) + (rz + numRz -1 );
if( rz >= 0)
{
cSim.pStructureFactor[atom*totalK + index] = FloatMultFloat2 ( (apos.w ), MultofFloat2 (tab_xy ,cSim.pEikr[atom*kmax*3 + rz*3 + 2] ));
}
else
{
cSim.pStructureFactor[atom*totalK + index] = FloatMultFloat2 ( ( apos.w ), ConjMultofFloat2 (tab_xy ,cSim.pEikr[atom*kmax*3 - rz*3 + 2] ));
}
cSim.pCosSinSum[index].x = 0.0;
cSim.pCosSinSum[index].y = 0.0;
lowrz = 1 - numRz;
}
lowry = 1 - numRy;
}
}
atom += blockDim.x * gridDim.x;
}
}
__global__ void kCalculateEwaldFastCosSinSums_kernel()
{
// float2 eikr;
int lowry = 0;
int lowrz = 1;
int numRx = 20+1;
int numRy = 20+1;
int numRz = 20+1;
unsigned int totalK = (numRx*2 - 1) * (numRy*2 - 1) * (numRz*2 - 1);
int index;
unsigned int rx = threadIdx.x + blockIdx.x * blockDim.x;
while (rx < numRx)
{
// ********************************************************************** // **********************************************************************
// cSim.pEikr[atom*kmax*3 + k*3 + m] // cSim.pEikr[atom*kmax*3 + k*3 + m]
for(int rx = 0; rx < numRx; rx++) { // for(int rx = 0; rx < numRx; rx++) {
for(int ry = lowry; ry < numRy; ry++) { for(int ry = lowry; ry < numRy; ry++) {
if(ry >= 0) {
// tab_xy[n] = EIR(rx, n, 0) * EIR(ry, n, 1);
tab_xy = MultofFloat2 (cSim.pEikr[atom*kmax*3 + rx*3 + 0] , cSim.pEikr[atom*kmax*3 + ry*3 + 1]);
}
else {
// tab_xy[n] = EIR(rx, n, 0) * conj (EIR(-ry, n, 1));
tab_xy = ConjMultofFloat2 (cSim.pEikr[atom*kmax*3 + rx*3 + 0] , cSim.pEikr[atom*kmax*3 - ry*3 + 1]);
}
for (int rz = lowrz; rz < numRz; rz++) { for (int rz = lowrz; rz < numRz; rz++) {
// next one is scary!
index = rx * (numRy*2 - 1 ) * (numRz*2 - 1) + (ry + numRy -1 ) * (numRz * 2 - 1) + (rz + numRz -1 ); index = rx * (numRy*2 - 1 ) * (numRz*2 - 1) + (ry + numRy -1 ) * (numRz * 2 - 1) + (rz + numRz -1 );
if( rz >= 0) { for (int atom = 0; atom < cSim.atoms; atom++)
//tab_qxyz[n] = atomParameters[n][QIndex] * (tab_xy[n] * EIR(rz, n, 2)); {
cSim.pStructureFactor[atom*totalK + index] = FloatMultFloat2 ( apos.w , MultofFloat2 (tab_xy ,cSim.pEikr[atom*kmax*3 + rz*3 + 2] )); cSim.pCosSinSum[index].x += cSim.pStructureFactor[atom*totalK + index].x;
} cSim.pCosSinSum[index].y += cSim.pStructureFactor[atom*totalK + index].y;
}
else {
// tab_qxyz[n] = atomParameters[n][QIndex] * (tab_xy[n] * conj(EIR(-rz, n, 2)));
cSim.pStructureFactor[atom*totalK + index] = FloatMultFloat2 ( apos.w , ConjMultofFloat2 (tab_xy ,cSim.pEikr[atom*kmax*3 - rz*3 + 2] ));
}
cSim.pCosSinSum[index].x += cSim.pStructureFactor[atom*totalK + index].x;
cSim.pCosSinSum[index].y += cSim.pStructureFactor[atom*totalK + index].y;
lowrz = 1 - numRz; lowrz = 1 - numRz;
} }
lowry = 1 - numRy; lowry = 1 - numRy;
} }
rx += blockDim.x * gridDim.x;
} }
// **********************************************************************
atom += blockDim.x * gridDim.x;
}
} }
__global__ void kCalculateEwaldFastForces_kernel() __global__ void kCalculateEwaldFastForces_kernel()
{ {
float PI = 3.14159265358979323846f; float PI = 3.14159265358979323846f;
// hard-coded maximum k-vectors, no interface yet
// int kmax = cSim.kmax;
const float epsilon = 1.0; const float epsilon = 1.0;
float recipCoeff = (4*PI/cSim.V/epsilon); float recipCoeff = (4*PI/cSim.V/epsilon);
// float2 eikr;
// float4 apos;
int lowry = 0; int lowry = 0;
int lowrz = 1; int lowrz = 1;
int numRx = 20+1; int numRx = 20+1;
...@@ -195,10 +242,6 @@ __global__ void kCalculateEwaldFastForces_kernel() ...@@ -195,10 +242,6 @@ __global__ void kCalculateEwaldFastForces_kernel()
while (atom < cSim.atoms) while (atom < cSim.atoms)
{ {
// apos = cSim.pPosq[atom];
// **********************************************************************
// cSim.pEikr[atom*kmax*3 + k*3 + m]
for(int rx = 0; rx < numRx; rx++) { for(int rx = 0; rx < numRx; rx++) {
...@@ -219,6 +262,7 @@ __global__ void kCalculateEwaldFastForces_kernel() ...@@ -219,6 +262,7 @@ __global__ void kCalculateEwaldFastForces_kernel()
float ak = exp(k2*cSim.factorEwald) / k2; float ak = exp(k2*cSim.factorEwald) / k2;
float dEdR = ak * (cSim.pCosSinSum[index].x * cSim.pStructureFactor[atom*totalK + index].y - cSim.pCosSinSum[index].y * cSim.pStructureFactor[atom*totalK + index].x); float dEdR = ak * (cSim.pCosSinSum[index].x * cSim.pStructureFactor[atom*totalK + index].y - cSim.pCosSinSum[index].y * cSim.pStructureFactor[atom*totalK + index].x);
cSim.pForce4[atom].x += 2 * recipCoeff * dEdR * kx ; cSim.pForce4[atom].x += 2 * recipCoeff * dEdR * kx ;
cSim.pForce4[atom].y += 2 * recipCoeff * dEdR * ky ; cSim.pForce4[atom].y += 2 * recipCoeff * dEdR * ky ;
cSim.pForce4[atom].z += 2 * recipCoeff * dEdR * kz ; cSim.pForce4[atom].z += 2 * recipCoeff * dEdR * kz ;
...@@ -228,7 +272,9 @@ __global__ void kCalculateEwaldFastForces_kernel() ...@@ -228,7 +272,9 @@ __global__ void kCalculateEwaldFastForces_kernel()
lowry = 1 - numRy; lowry = 1 - numRy;
} }
} }
// **********************************************************************
atom += blockDim.x * gridDim.x; atom += blockDim.x * gridDim.x;
} }
} }
...@@ -215,12 +215,14 @@ void kCalculateCDLJForces(gpuContext gpu) ...@@ -215,12 +215,14 @@ void kCalculateCDLJForces(gpuContext gpu)
LAUNCHERROR("kCalculateCDLJEwaldReciprocalForces"); LAUNCHERROR("kCalculateCDLJEwaldReciprocalForces");
// If using Fast Ewald, uncomment the lines below // If using Fast Ewald, uncomment the lines below
// kCalculateEwaldFastEikr_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(); // kCalculateEwaldFastEikr_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
// LAUNCHERROR("kCalculateEwaldFastEikr"); // LAUNCHERROR("kCalculateEwaldFastEikr");
// kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(); // kCalculateEwaldFastStructureFactors_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
// LAUNCHERROR("kCalculateEwaldFastCosSinSums"); // LAUNCHERROR("kCalculateEwaldFastStructureFactors_kernel");
// kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(); // kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
// LAUNCHERROR("kCalculateEwaldFastForces"); // LAUNCHERROR("kCalculateEwaldFastCosSinSums");
// kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
// LAUNCHERROR("kCalculateEwaldFastForces");
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment