Commit 49118236 authored by Yutong Zhao's avatar Yutong Zhao
Browse files

Fixed potential energy discrepancies in large systems between CUDA and...

Fixed potential energy discrepancies in large systems between CUDA and Reference platform. The energy calculation now uses the same method as that of reciprocalConvolution so it mirrors the reference implementation. OpenCL is fine.  
parent 407c0b93
...@@ -1686,6 +1686,11 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1686,6 +1686,11 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
else else
cufftExecR2C(fftForward, (float*) originalPmeGrid->getDevicePointer(), (float2*) reciprocalPmeGrid->getDevicePointer()); cufftExecR2C(fftForward, (float*) originalPmeGrid->getDevicePointer(), (float2*) reciprocalPmeGrid->getDevicePointer());
if (includeEnergy) {
void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, cu.getNumAtoms());
}
void* convolutionArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()}; void* convolutionArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeConvolutionKernel, convolutionArgs, cu.getNumAtoms()); cu.executeKernel(pmeConvolutionKernel, convolutionArgs, cu.getNumAtoms());
...@@ -1694,11 +1699,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1694,11 +1699,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
else else
cufftExecC2R(fftBackward, (float2*) reciprocalPmeGrid->getDevicePointer(), (float*) convolvedPmeGrid->getDevicePointer()); cufftExecC2R(fftBackward, (float2*) reciprocalPmeGrid->getDevicePointer(), (float*) convolvedPmeGrid->getDevicePointer());
if (includeEnergy) {
void* computeEnergyArgs[] = {&originalPmeGrid->getDevicePointer(), &convolvedPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer() };
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, cu.getNumAtoms());
}
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &convolvedPmeGrid->getDevicePointer(), void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &convolvedPmeGrid->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()}; cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms()); cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms());
......
extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, real4* __restrict__ pmeBsplineTheta, int2* __restrict__ pmeAtomGridIndex, extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, real4* __restrict__ pmeBsplineTheta, int2* __restrict__ pmeAtomGridIndex,
real4 periodicBoxSize, real4 invPeriodicBoxSize) { real4 periodicBoxSize, real4 invPeriodicBoxSize) {
extern __shared__ real3 bsplinesCache[]; extern __shared__ real3 bsplinesCache[];
real3* data = &bsplinesCache[threadIdx.x*PME_ORDER]; real3* data = &bsplinesCache[threadIdx.x*PME_ORDER];
const real3 scale = make_real3(RECIP(PME_ORDER-1)); const real3 scale = make_real3(RECIP(PME_ORDER-1));
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) { for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
real4 pos = posq[i]; real4 pos = posq[i];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z; pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
real3 t = make_real3((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X, real3 t = make_real3((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
(pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y, (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
(pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z); (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z); real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X, int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
((int) t.y) % GRID_SIZE_Y, ((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z); ((int) t.z) % GRID_SIZE_Z);
pmeAtomGridIndex[i] = make_int2(i, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z); pmeAtomGridIndex[i] = make_int2(i, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
data[PME_ORDER-1] = make_real3(0); data[PME_ORDER-1] = make_real3(0);
data[1] = dr; data[1] = dr;
data[0] = make_real3(1)-dr; data[0] = make_real3(1)-dr;
for (int j = 3; j < PME_ORDER; j++) { for (int j = 3; j < PME_ORDER; j++) {
real div = RECIP(j-1); real div = RECIP(j-1);
data[j-1] = div*dr*data[j-2]; data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++) for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+make_real3(k)) *data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]); data[j-k-1] = div*((dr+make_real3(k)) *data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
data[0] = div*(make_real3(1)-dr)*data[0]; data[0] = div*(make_real3(1)-dr)*data[0];
} }
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2]; data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++) for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]); data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
data[0] = scale*(make_real3(1)-dr)*data[0]; data[0] = scale*(make_real3(1)-dr)*data[0];
for (int j = 0; j < PME_ORDER; j++) { for (int j = 0; j < PME_ORDER; j++) {
real3 d = data[j]; // Copy it as a workaround for a bug in CUDA 5.0 real3 d = data[j]; // Copy it as a workaround for a bug in CUDA 5.0
pmeBsplineTheta[i+j*NUM_ATOMS] = make_real4(d.x, d.y, d.z, pos.w); // Storing the charge here improves cache coherency in the charge spreading kernel pmeBsplineTheta[i+j*NUM_ATOMS] = make_real4(d.x, d.y, d.z, pos.w); // Storing the charge here improves cache coherency in the charge spreading kernel
} }
} }
} }
/** /**
* For each grid point, find the range of sorted atoms associated with that point. * For each grid point, find the range of sorted atoms associated with that point.
*/ */
extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange, const real4* __restrict__ posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) { extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange, const real4* __restrict__ posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
int start = (NUM_ATOMS*(blockIdx.x*blockDim.x+threadIdx.x))/(blockDim.x*gridDim.x); int start = (NUM_ATOMS*(blockIdx.x*blockDim.x+threadIdx.x))/(blockDim.x*gridDim.x);
int end = (NUM_ATOMS*(blockIdx.x*blockDim.x+threadIdx.x+1))/(blockDim.x*gridDim.x); int end = (NUM_ATOMS*(blockIdx.x*blockDim.x+threadIdx.x+1))/(blockDim.x*gridDim.x);
int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y); int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
for (int i = start; i < end; ++i) { for (int i = start; i < end; ++i) {
int2 atomData = pmeAtomGridIndex[i]; int2 atomData = pmeAtomGridIndex[i];
int gridIndex = atomData.y; int gridIndex = atomData.y;
if (gridIndex != last) { if (gridIndex != last) {
for (int j = last+1; j <= gridIndex; ++j) for (int j = last+1; j <= gridIndex; ++j)
pmeAtomRange[j] = i; pmeAtomRange[j] = i;
last = gridIndex; last = gridIndex;
} }
} }
// Fill in values beyond the last atom. // Fill in values beyond the last atom.
if (blockIdx.x == gridDim.x-1 && threadIdx.x == blockDim.x-1) { if (blockIdx.x == gridDim.x-1 && threadIdx.x == blockDim.x-1) {
int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int j = last+1; j <= gridSize; ++j) for (int j = last+1; j <= gridSize; ++j)
pmeAtomRange[j] = NUM_ATOMS; pmeAtomRange[j] = NUM_ATOMS;
} }
} }
#define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER) #define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER)
extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid, extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid,
const real4* __restrict__ pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) { const real4* __restrict__ pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
int ix = threadIdx.x/(PME_ORDER*PME_ORDER); int ix = threadIdx.x/(PME_ORDER*PME_ORDER);
int remainder = threadIdx.x-ix*PME_ORDER*PME_ORDER; int remainder = threadIdx.x-ix*PME_ORDER*PME_ORDER;
int iy = remainder/PME_ORDER; int iy = remainder/PME_ORDER;
int iz = remainder-iy*PME_ORDER; int iz = remainder-iy*PME_ORDER;
__shared__ real4 theta[PME_ORDER]; __shared__ real4 theta[PME_ORDER];
__shared__ real charge[BUFFER_SIZE]; __shared__ real charge[BUFFER_SIZE];
__shared__ int basex[BUFFER_SIZE]; __shared__ int basex[BUFFER_SIZE];
__shared__ int basey[BUFFER_SIZE]; __shared__ int basey[BUFFER_SIZE];
__shared__ int basez[BUFFER_SIZE]; __shared__ int basez[BUFFER_SIZE];
if (ix < PME_ORDER) { if (ix < PME_ORDER) {
for (int baseIndex = blockIdx.x*BUFFER_SIZE; baseIndex < NUM_ATOMS; baseIndex += gridDim.x*BUFFER_SIZE) { for (int baseIndex = blockIdx.x*BUFFER_SIZE; baseIndex < NUM_ATOMS; baseIndex += gridDim.x*BUFFER_SIZE) {
// Load the next block of atoms into the buffers. // Load the next block of atoms into the buffers.
int atomIndex = baseIndex+threadIdx.x; int atomIndex = baseIndex+threadIdx.x;
if (atomIndex < NUM_ATOMS) { if (atomIndex < NUM_ATOMS) {
real4 pos = posq[atomIndex]; real4 pos = posq[atomIndex];
charge[threadIdx.x] = pos.w; charge[threadIdx.x] = pos.w;
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z; pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
basex[threadIdx.x] = (int) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X); basex[threadIdx.x] = (int) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X);
basey[threadIdx.x] = (int) ((pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y); basey[threadIdx.x] = (int) ((pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y);
basez[threadIdx.x] = (int) ((pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z); basez[threadIdx.x] = (int) ((pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
} }
__syncthreads(); __syncthreads();
int lastIndex = min(BUFFER_SIZE, NUM_ATOMS-baseIndex); int lastIndex = min(BUFFER_SIZE, NUM_ATOMS-baseIndex);
for (int index = 0; index < lastIndex; index++) { for (int index = 0; index < lastIndex; index++) {
int atomIndex = index+baseIndex; int atomIndex = index+baseIndex;
if (threadIdx.x < PME_ORDER) if (threadIdx.x < PME_ORDER)
theta[threadIdx.x] = pmeBsplineTheta[atomIndex+threadIdx.x*NUM_ATOMS]; theta[threadIdx.x] = pmeBsplineTheta[atomIndex+threadIdx.x*NUM_ATOMS];
__syncthreads(); __syncthreads();
real add = charge[index]*theta[ix].x*theta[iy].y*theta[iz].z; real add = charge[index]*theta[ix].x*theta[iy].y*theta[iz].z;
int x = basex[index]+ix; int x = basex[index]+ix;
int y = basey[index]+iy; int y = basey[index]+iy;
int z = basez[index]+iz; int z = basez[index]+iz;
x -= (x >= GRID_SIZE_X ? GRID_SIZE_X : 0); x -= (x >= GRID_SIZE_X ? GRID_SIZE_X : 0);
y -= (y >= GRID_SIZE_Y ? GRID_SIZE_Y : 0); y -= (y >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
#ifdef USE_DOUBLE_PRECISION #ifdef USE_DOUBLE_PRECISION
unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid; unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
atomicAdd(&ulonglong_p[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], static_cast<unsigned long long>((long long) (add*0x100000000))); atomicAdd(&ulonglong_p[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], static_cast<unsigned long long>((long long) (add*0x100000000)));
#elif __CUDA_ARCH__ < 200 #elif __CUDA_ARCH__ < 200
unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid; unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
int gridIndex = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z; int gridIndex = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z;
gridIndex = (gridIndex%2 == 0 ? gridIndex/2 : (gridIndex+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2); gridIndex = (gridIndex%2 == 0 ? gridIndex/2 : (gridIndex+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2);
atomicAdd(&ulonglong_p[gridIndex], static_cast<unsigned long long>((long long) (add*0x100000000))); atomicAdd(&ulonglong_p[gridIndex], static_cast<unsigned long long>((long long) (add*0x100000000)));
#else #else
atomicAdd(&originalPmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], add*EPSILON_FACTOR); atomicAdd(&originalPmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], add*EPSILON_FACTOR);
#endif #endif
} }
} }
} }
} }
extern "C" __global__ void finishSpreadCharge(long long* __restrict__ originalPmeGrid) { extern "C" __global__ void finishSpreadCharge(long long* __restrict__ originalPmeGrid) {
real* floatGrid = (real*) originalPmeGrid; real* floatGrid = (real*) originalPmeGrid;
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
real scale = EPSILON_FACTOR/(real) 0x100000000; real scale = EPSILON_FACTOR/(real) 0x100000000;
#ifdef USE_DOUBLE_PRECISION #ifdef USE_DOUBLE_PRECISION
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x)
floatGrid[index] = scale*originalPmeGrid[index]; floatGrid[index] = scale*originalPmeGrid[index];
#else #else
for (int index = 2*(blockIdx.x*blockDim.x+threadIdx.x); index < gridSize; index += 2*blockDim.x*gridDim.x) { for (int index = 2*(blockIdx.x*blockDim.x+threadIdx.x); index < gridSize; index += 2*blockDim.x*gridDim.x) {
floatGrid[index] = scale*originalPmeGrid[index/2]; floatGrid[index] = scale*originalPmeGrid[index/2];
if (index+1 < gridSize) if (index+1 < gridSize)
floatGrid[index+1] = scale*originalPmeGrid[(index+gridSize+1)/2]; floatGrid[index+1] = scale*originalPmeGrid[(index+gridSize+1)/2];
} }
#endif #endif
} }
// convolutes on the halfcomplex_pmeGrid, which is of size NX*NY*(NZ/2+1) as F(Q) is conjugate symmetric // convolutes on the halfcomplex_pmeGrid, which is of size NX*NY*(NZ/2+1) as F(Q) is conjugate symmetric
extern "C" __global__ void extern "C" __global__ void
reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict__ energyBuffer, reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict__ energyBuffer,
const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliX,
const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ,
real4 periodicBoxSize, real4 invPeriodicBoxSize) { real4 periodicBoxSize, real4 invPeriodicBoxSize) {
// R2C stores into a half complex matrix where the last dimension is cut by half // R2C stores into a half complex matrix where the last dimension is cut by half
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1); const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z); const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
// real indices // real indices
int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z/2+1)); int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z/2+1); int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
int ky = remainder/(GRID_SIZE_Z/2+1); int ky = remainder/(GRID_SIZE_Z/2+1);
int kz = remainder-ky*(GRID_SIZE_Z/2+1); int kz = remainder-ky*(GRID_SIZE_Z/2+1);
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
// reciprocal space indices int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X); int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y); real mhx = mx*invPeriodicBoxSize.x;
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z); real mhy = my*invPeriodicBoxSize.y;
real mhz = mz*invPeriodicBoxSize.z;
// find the coordinates of the reciprocal space vectors real bx = pmeBsplineModuliX[kx];
real mhx = mx*invPeriodicBoxSize.x; real by = pmeBsplineModuliY[ky];
real mhy = my*invPeriodicBoxSize.y; real bz = pmeBsplineModuliZ[kz];
real mhz = mz*invPeriodicBoxSize.z; real2 grid = halfcomplex_pmeGrid[index];
real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
real bx = pmeBsplineModuliX[kx]; real denom = m2*bx*by*bz;
real by = pmeBsplineModuliY[ky]; real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
real bz = pmeBsplineModuliZ[kz];
if (kx != 0 || ky != 0 || kz != 0) {
real2 grid = halfcomplex_pmeGrid[index]; halfcomplex_pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm);
}
real m2 = mhx*mhx+mhy*mhy+mhz*mhz; }
real denom = m2*bx*by*bz; }
real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
extern "C" __global__ void
if (kx != 0 || ky != 0 || kz != 0) { gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict__ energyBuffer,
halfcomplex_pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm); const real* __restrict__ pmeBsplineModuliX,
} const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ,
} real4 periodicBoxSize, real4 invPeriodicBoxSize) {
} // R2C stores into a half complex matrix where the last dimension is cut by half
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
extern "C" __global__ const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
void gridEvaluateEnergy(const real* __restrict__ originalGrid, const real* __restrict__ convolvedGrid, real* __restrict__ energyBuffer) {
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; real energy = 0;
real energy = 0; for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) // real indices
energy += originalGrid[index]*convolvedGrid[index]; int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z));
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5*energy; int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z);
} int ky = remainder/(GRID_SIZE_Z);
int kz = remainder-ky*(GRID_SIZE_Z);
extern "C" __global__ int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real* __restrict__ originalPmeGrid, int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
real4 periodicBoxSize, real4 invPeriodicBoxSize) { int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
real3 data[PME_ORDER]; real mhx = mx*invPeriodicBoxSize.x;
real3 ddata[PME_ORDER]; real mhy = my*invPeriodicBoxSize.y;
const real scale = RECIP(PME_ORDER-1); real mhz = mz*invPeriodicBoxSize.z;
real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) { real bx = pmeBsplineModuliX[kx];
real3 force = make_real3(0); real by = pmeBsplineModuliY[ky];
real4 pos = posq[atom]; real bz = pmeBsplineModuliZ[kz];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; real denom = m2*bx*by*bz;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
real3 t = make_real3((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X, if(kz >= (GRID_SIZE_Z/2+1)) {
(pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y, kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
(pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z); ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky);
int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X, kz = GRID_SIZE_Z-kz;
((int) t.y) % GRID_SIZE_Y, }
((int) t.z) % GRID_SIZE_Z); int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
real2 grid = halfcomplex_pmeGrid[indexInHalfComplexGrid];
// Since we need the full set of thetas, it's faster to compute them here than load them if (kx != 0 || ky != 0 || kz != 0) {
// from global memory. energy += eterm*(grid.x*grid.x + grid.y*grid.y);
}
real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z); }
data[PME_ORDER-1] = make_real3(0); energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy;
data[1] = dr; }
data[0] = make_real3(1)-dr;
extern "C" __global__
for (int j = 3; j < PME_ORDER; j++) { void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real* __restrict__ originalPmeGrid,
real div = RECIP(j-1); real4 periodicBoxSize, real4 invPeriodicBoxSize) {
data[j-1] = div*dr*data[j-2]; real3 data[PME_ORDER];
for (int k = 1; k < (j-1); k++) real3 ddata[PME_ORDER];
data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]); const real scale = RECIP(PME_ORDER-1);
data[0] = div*(make_real3(1)-dr)*data[0];
} for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
ddata[0] = -data[0]; real3 force = make_real3(0);
real4 pos = posq[atom];
for (int j = 1; j < PME_ORDER; j++) pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
ddata[j] = data[j-1]-data[j]; pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2]; pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
real3 t = make_real3((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
for (int j = 1; j < (PME_ORDER-1); j++) (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]); (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
data[0] = scale*(make_real3(1)-dr)*data[0]; int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
((int) t.y) % GRID_SIZE_Y,
// Compute the force on this atom. ((int) t.z) % GRID_SIZE_Z);
for (int ix = 0; ix < PME_ORDER; ix++) { // Since we need the full set of thetas, it's faster to compute them here than load them
int xbase = gridIndex.x+ix; // from global memory.
xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z; real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
real dx = data[ix].x; data[PME_ORDER-1] = make_real3(0);
real ddx = ddata[ix].x; data[1] = dr;
data[0] = make_real3(1)-dr;
for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = gridIndex.y+iy; for (int j = 3; j < PME_ORDER; j++) {
ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0); real div = RECIP(j-1);
ybase = xbase + ybase*GRID_SIZE_Z; data[j-1] = div*dr*data[j-2];
real dy = data[iy].y; for (int k = 1; k < (j-1); k++)
real ddy = ddata[iy].y; data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
data[0] = div*(make_real3(1)-dr)*data[0];
for (int iz = 0; iz < PME_ORDER; iz++) { }
int zindex = gridIndex.z+iz; ddata[0] = -data[0];
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = ybase + zindex; for (int j = 1; j < PME_ORDER; j++)
real gridvalue = originalPmeGrid[index]; ddata[j] = data[j-1]-data[j];
force.x += ddx*dy*data[iz].z*gridvalue; data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
force.y += dx*ddy*data[iz].z*gridvalue;
force.z += dx*dy*ddata[iz].z*gridvalue; for (int j = 1; j < (PME_ORDER-1); j++)
} data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
} data[0] = scale*(make_real3(1)-dr)*data[0];
}
real q = pos.w*EPSILON_FACTOR; // Compute the force on this atom.
forceBuffers[atom] += static_cast<unsigned long long>((long long) (-q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x*0x100000000));
forceBuffers[atom+PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y*0x100000000)); for (int ix = 0; ix < PME_ORDER; ix++) {
forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z*0x100000000)); int xbase = gridIndex.x+ix;
} xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
} xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
real dx = data[ix].x;
real ddx = ddata[ix].x;
for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = gridIndex.y+iy;
ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
ybase = xbase + ybase*GRID_SIZE_Z;
real dy = data[iy].y;
real ddy = ddata[iy].y;
for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = ybase + zindex;
real gridvalue = originalPmeGrid[index];
force.x += ddx*dy*data[iz].z*gridvalue;
force.y += dx*ddy*data[iz].z*gridvalue;
force.z += dx*dy*ddata[iz].z*gridvalue;
}
}
}
real q = pos.w*EPSILON_FACTOR;
forceBuffers[atom] += static_cast<unsigned long long>((long long) (-q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x*0x100000000));
forceBuffers[atom+PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y*0x100000000));
forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z*0x100000000));
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment