Commit 1b9ab82b authored by Rossen Apostolov's avatar Rossen Apostolov
Browse files

Real space Ewald (direct summation) is now in kCalculateCDLJForces.h,...

Real space Ewald (direct summation) is now in kCalculateCDLJForces.h, reciprocal summation is done in a separate kernel
parent d3b512ad
......@@ -108,7 +108,23 @@ void GetCalculateCDLJForcesSim(gpuContext gpu)
#include "kCalculateCDLJForces.h"
// Include version of the kernel with Ewald method
#include "kCalculateEwald.h"
// Real Space Ewald uses almost the same kernels as Periodic
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define USE_EWALD
#define METHOD_NAME(a, b) a##EwaldDirect##b
#include "kCalculateCDLJForces.h"
#include "kFindInteractingBlocks.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##EwaldDirectByWarp##b
#include "kCalculateCDLJForces.h"
// Reciprocal Space Ewald summation is in a separate kernel
//#include "kCalculateEwaldReciprocal.h"
__global__ extern void kCalculateCDLJCutoffForces_12_kernel();
......@@ -177,10 +193,10 @@ void kCalculateCDLJForces(gpuContext gpu)
LAUNCHERROR("kCalculateCDLJPeriodicForces");
break;
case EWALD:
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsPeriodic");
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
kFindBlockBoundsEwaldDirect_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsEwaldDirect");
kFindBlocksWithInteractionsEwaldDirect_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsEwaldDirect");
result = cudppCompact(gpu->cudpp, gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractionCount,
gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits);
if (result != CUDPP_SUCCESS)
......@@ -190,15 +206,15 @@ void kCalculateCDLJForces(gpuContext gpu)
}
gpu->psInteractionCount->Download();
numWithInteractions = gpu->psInteractionCount->_pSysData[0];
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
kFindInteractionsWithinBlocksEwaldDirect_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
kCalculateCDLJEwaldDirectByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
else
kCalculateCDLJEwaldForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
kCalculateCDLJEwaldDirectForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
LAUNCHERROR("kCalculateCDLJPeriodicForces");
LAUNCHERROR("kCalculateCDLJEwaldDirectForces");
}
}
......@@ -46,6 +46,12 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
float3* tempBuffer = (float3*) &sA[cSim.nonbond_threads_per_block];
#endif
#ifdef USE_EWALD
float alphaEwald = 3.123413;
float PI = 3.14159265358979323846f;
float SQRT_PI = sqrt(PI);
#endif
unsigned int lasty = 0xFFFFFFFF;
while (pos < end)
{
......@@ -108,7 +114,13 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
eps = a.y * psA[j].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
#ifdef USE_CUTOFF
#ifdef USE_EWALD
float r = sqrt(r2);
float alphaR = alphaEwald * r;
dEdR += apos.w * psA[tj].q * invR * invR * invR * (erfc(alphaR) + 2.0 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
#else
dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
#endif
#else
dEdR += apos.w * psA[j].q * invR;
#endif
......@@ -151,7 +163,13 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
eps = a.y * psA[j].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
#ifdef USE_CUTOFF
#ifdef USE_EWALD
float r = sqrt(r2);
float alphaR = alphaEwald * r;
dEdR += apos.w * psA[tj].q * invR * invR * invR * (erfc(alphaR) + 2.0 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
#else
dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
#endif
#else
dEdR += apos.w * psA[j].q * invR;
#endif
......@@ -243,7 +261,13 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
eps = a.y * psA[tj].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
#ifdef USE_CUTOFF
#ifdef USE_EWALD
float r = sqrt(r2);
float alphaR = alphaEwald * r;
dEdR += apos.w * psA[tj].q * invR * invR * invR * (erfc(alphaR) + 2.0 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
#else
dEdR += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
#endif
#else
dEdR += apos.w * psA[tj].q * invR;
#endif
......@@ -292,7 +316,13 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
eps = a.y * psA[j].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
#ifdef USE_CUTOFF
#ifdef USE_EWALD
float r = sqrt(r2);
float alphaR = alphaEwald * r;
dEdR += apos.w * psA[tj].q * invR * invR * invR * (erfc(alphaR) + 2.0 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
#else
dEdR += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
#endif
#else
dEdR += apos.w * psA[j].q * invR;
#endif
......@@ -377,7 +407,13 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
eps = a.y * psA[tj].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
#ifdef USE_CUTOFF
#ifdef USE_EWALD
float r = sqrt(r2);
float alphaR = alphaEwald * r;
dEdR += apos.w * psA[tj].q * invR * invR * invR * (erfc(alphaR) + 2.0 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
#else
dEdR += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
#endif
#else
dEdR += apos.w * psA[tj].q * invR;
#endif
......
......@@ -71,17 +71,6 @@ __device__ cuComplex FloatComplexMul(float r, cuComplex a)
__global__ void kCalculateCDLJEwaldForces_kernel(unsigned int* workUnit, int numWorkUnits)
{
/*
extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
int pos = warp*numWorkUnits/totalWarps;
int end = (warp+1)*numWorkUnits/totalWarps;
//###########################################################################
// EWALD RECIP SPACE
//###########################################################################
// *******************************************************************
float alphaEwald = 3.123413f;
......@@ -110,32 +99,40 @@ __global__ void kCalculateCDLJEwaldForces_kernel(unsigned int* workUnit, int nu
unsigned int numRz = 60+1;
const int kmax = 61;
cuComplex eir[kmax][numberOfAtoms][3];
cuComplex tab_xy[numberOfAtoms];
cuComplex tab_qxyz[numberOfAtoms];
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
cuComplex eir[kmax][cSim.atoms][3];
cuComplex tab_xy[cSim.atoms];
cuComplex tab_qxyz[cSim.atoms];
for(unsigned int i = 0; i < numberOfAtoms; i++) {
while (pos < cSim.atoms)
{
float4 apos = cSim.pPosq[pos];
for(unsigned int m = 0; (m < 3); m++) {
eir[0][i][m].x = 1;
eir[0][i][m].y = 0;
eir[0][pos][m].x = 1;
eir[0][pos][m].y = 0;
}
eir[1][i][0].x = cos(apos.x*recipBoxSizeX);
eir[1][i][0].y = sin(apos.x*recipBoxSizeX);
eir[1][pos][0].x = cos(apos.x*recipBoxSizeX);
eir[1][pos][0].y = sin(apos.x*recipBoxSizeX);
eir[1][i][1].x = cos(apos.y*recipBoxSizeY);
eir[1][i][1].y = sin(apos.y*recipBoxSizeY);
eir[1][pos][1].x = cos(apos.y*recipBoxSizeY);
eir[1][pos][1].y = sin(apos.y*recipBoxSizeY);
eir[1][i][2].x = cos(apos.z*recipBoxSizeZ);
eir[1][i][2].y = sin(apos.z*recipBoxSizeZ);
eir[1][pos][2].x = cos(apos.z*recipBoxSizeZ);
eir[1][pos][2].y = sin(apos.z*recipBoxSizeZ);
for(unsigned int j=2; (j<kmax); j++) {
for(unsigned int m=0; (m<3); m++) {
eir[j][i][m] = ComplexMul (eir[j-1][i][m] , eir[1][i][m]);
eir[j][pos][m] = ComplexMul (eir[j-1][pos][m] , eir[1][pos][m]);
}
}
pos += blockDim.x * gridDim.x;
}
// *******************************************************************
......@@ -153,14 +150,18 @@ __global__ void kCalculateCDLJEwaldForces_kernel(unsigned int* workUnit, int nu
float ky = ry * recipBoxSizeY;
if(ry >= 0) {
for(int n = 0; n < numberOfAtoms; n++) {
tab_xy[n] = ComplexMul (eir[rx][n][0] , eir[ry][n][1]);
while (pos < cSim.atoms)
{
tab_xy[pos] = ComplexMul (eir[rx][pos][0] , eir[ry][pos][1]);
pos += blockDim.x * gridDim.x;
}
}
else {
for(int n = 0; n < numberOfAtoms; n++) {
tab_xy[n]= ComplexConjMul (eir[rx][n][0] , (eir[-ry][n][1]));
while (pos < cSim.atoms)
{
tab_xy[pos]= ComplexConjMul (eir[rx][pos][0] , (eir[-ry][pos][1]));
pos += blockDim.x * gridDim.x;
}
}
......@@ -173,34 +174,47 @@ __global__ void kCalculateCDLJEwaldForces_kernel(unsigned int* workUnit, int nu
if( rz >= 0) {
for( int n = 0; n < numberOfAtoms; n++) {
tab_qxyz[n] = FloatComplexMul ( Charge[n] * ComplexMul (tab_xy[n] , eir[rz][n][2]));
while (pos < cSim.atoms)
{
float4 apos = cSim.pPosq[pos];
apos.w *= cSim.epsfac;
tab_qxyz[pos] = FloatComplexMul ( apos.w * ComplexMul (tab_xy[pos] , eir[rz][pos][2]));
pos += blockDim.x * gridDim.x;
}
}
else {
for( int n = 0; n < numberOfAtoms; n++) {
tab_qxyz[n] = FloatComplexMul( Charge[n] * ComplexConjMul (tab_xy[n] , conj(eir[-rz][n][2])) );
while (pos < cSim.atoms)
{
float4 apos = cSim.pPosq[pos];
apos.w *= cSim.epsfac;
tab_qxyz[pos] = FloatComplexMul( apos.w * ComplexConjMul (tab_xy[pos] , conj(eir[-rz][pos][2])) );
pos += blockDim.x * gridDim.x;
}
}
float cs = 0.0f;
float ss = 0.0f;
for( int n = 0; n < numberOfAtoms; n++) {
cs += tab_qxyz[n].x;
ss += tab_qxyz[n].y;
while (pos < cSim.atoms)
{
cs += tab_qxyz[pos].x;
ss += tab_qxyz[pos].y;
pos += blockDim.x * gridDim.x;
}
recipEnergy += ak * ( cs * cs + ss * ss);
float vir = akv * ( cs * cs + ss * ss);
for(int n = 0; n < numberOfAtoms; n++) {
float force = ak * (cs * tab_qxyz[n].y - ss * tab_qxyz[n].x);
forces[n][0] += 2.0 * recipCoeff * force * kx ;
forces[n][1] += 2.0 * recipCoeff * force * ky ;
forces[n][2] += 2.0 * recipCoeff * force * kz ;
while (pos < cSim.atoms)
{
float4 force = cSim.pForce4[pos];
float dEdR = ak * (cs * tab_qxyz[pos].y - ss * tab_qxyz[pos].x);
force.x += 2.0 * recipCoeff * dEdR * kx ;
force.y += 2.0 * recipCoeff * dEdR * ky ;
force.z += 2.0 * recipCoeff * dEdR * kz ;
}
lowrz = 1 - numRz;
......@@ -217,120 +231,4 @@ __global__ void kCalculateCDLJEwaldForces_kernel(unsigned int* workUnit, int nu
// END EWALD RECIP SPACE
//###########################################################################
while (pos < end)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = workUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
bool bExclusionFlag = (x & 0x1);
x = (x >> 17) << GRIDBITS;
float4 apos; // Local atom x, y, z, q
float3 af; // Local atom fx, fy, fz
float dx;
float dy;
float dz;
float r2;
float r;
float invR;
float sig;
float sig2;
float sig6;
float eps;
float dEdR;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
Atom* psA = &sA[tbx];
unsigned int i = x + tgx;
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
int j = y + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
sA[threadIdx.x].fx = 0.0f;
sA[threadIdx.x].fy = 0.0f;
sA[threadIdx.x].fz = 0.0f;
apos.w *= cSim.epsfac;
// Read fixed atom data into registers and GRF
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
excl = (excl >> tgx) | (excl << (GRID - tgx));
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
r2 = dx * dx + dy * dy + dz * dz;
r = sqrt(r2);
invR = 1.0f / sqrt(r2);
sig = a.x + psA[tj].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[tj].eps;
// ##### LJ
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
// ##### SHORT RANGE EWALD
float alphaR = alphaEwald * r;
dEdR += apos.w * psA[tj].q * invR * invR * invR * (erfc(alphaR) + 2.0 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
if (!(excl & 0x1) || r2 > cSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
}
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
excl >>= 1;
tj = (tj + 1) & (GRID - 1);
}
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
pos++;
}
*/
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment