extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, real4* __restrict__ pmeBsplineTheta, int2* __restrict__ pmeAtomGridIndex, real4 periodicBoxSize, real4 invPeriodicBoxSize) { extern __shared__ real3 bsplinesCache[]; real3* data = &bsplinesCache[threadIdx.x*PME_ORDER]; const real3 scale = make_real3(RECIP(PME_ORDER-1)); for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) { real4 pos = posq[i]; pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z; real3 t = make_real3((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X, (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y, (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z); real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z); int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X, ((int) t.y) % GRID_SIZE_Y, ((int) t.z) % GRID_SIZE_Z); pmeAtomGridIndex[i] = make_int2(i, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z); data[PME_ORDER-1] = make_real3(0); data[1] = dr; data[0] = make_real3(1)-dr; for (int j = 3; j < PME_ORDER; j++) { real div = RECIP(j-1); data[j-1] = div*dr*data[j-2]; for (int k = 1; k < (j-1); k++) data[j-k-1] = div*((dr+make_real3(k)) *data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]); data[0] = div*(make_real3(1)-dr)*data[0]; } data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2]; for (int j = 1; j < (PME_ORDER-1); j++) data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]); data[0] = scale*(make_real3(1)-dr)*data[0]; for (int j = 0; j < PME_ORDER; j++) pmeBsplineTheta[i+j*NUM_ATOMS] = make_real4(data[j].x, data[j].y, data[j].z, pos.w); // Storing the charge here improves cache coherency in the charge spreading kernel } } /** * For each grid point, find the range of sorted atoms associated with that point. */ extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange, const real4* __restrict__ posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) { int start = (NUM_ATOMS*(blockIdx.x*blockDim.x+threadIdx.x))/(blockDim.x*gridDim.x); int end = (NUM_ATOMS*(blockIdx.x*blockDim.x+threadIdx.x+1))/(blockDim.x*gridDim.x); int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y); for (int i = start; i < end; ++i) { int2 atomData = pmeAtomGridIndex[i]; int gridIndex = atomData.y; if (gridIndex != last) { for (int j = last+1; j <= gridIndex; ++j) pmeAtomRange[j] = i; last = gridIndex; } } // Fill in values beyond the last atom. if (blockIdx.x == gridDim.x-1 && threadIdx.x == blockDim.x-1) { int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; for (int j = last+1; j <= gridSize; ++j) pmeAtomRange[j] = NUM_ATOMS; } } #define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER) extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, unsigned long long* __restrict__ pmeGrid, const real4* __restrict__ pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) { int ix = threadIdx.x/(PME_ORDER*PME_ORDER); int remainder = threadIdx.x-ix*PME_ORDER*PME_ORDER; int iy = remainder/PME_ORDER; int iz = remainder-iy*PME_ORDER; __shared__ real4 theta[PME_ORDER]; __shared__ real charge[BUFFER_SIZE]; __shared__ int basex[BUFFER_SIZE]; __shared__ int basey[BUFFER_SIZE]; __shared__ int basez[BUFFER_SIZE]; if (ix < PME_ORDER) { for (int baseIndex = blockIdx.x*BUFFER_SIZE; baseIndex < NUM_ATOMS; baseIndex += gridDim.x*BUFFER_SIZE) { // Load the next block of atoms into the buffers. int atomIndex = baseIndex+threadIdx.x; if (atomIndex < NUM_ATOMS) { real4 pos = posq[atomIndex]; charge[threadIdx.x] = pos.w; pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z; basex[threadIdx.x] = (int) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X); basey[threadIdx.x] = (int) ((pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y); basez[threadIdx.x] = (int) ((pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z); } __syncthreads(); int lastIndex = min(BUFFER_SIZE, NUM_ATOMS-baseIndex); for (int index = 0; index < lastIndex; index++) { int atomIndex = index+baseIndex; if (threadIdx.x < PME_ORDER) theta[threadIdx.x] = pmeBsplineTheta[atomIndex+threadIdx.x*NUM_ATOMS]; __syncthreads(); real add = charge[index]*theta[ix].x*theta[iy].y*theta[iz].z; int x = basex[index]+ix; int y = basey[index]+iy; int z = basez[index]+iz; x -= (x >= GRID_SIZE_X ? GRID_SIZE_X : 0); y -= (y >= GRID_SIZE_Y ? GRID_SIZE_Y : 0); z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); #ifdef USE_DOUBLE_PRECISION atomicAdd(&pmeGrid[2*(x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z)], static_cast((long long) (add*0xFFFFFFFF))); #else atomicAdd(&pmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], static_cast((long long) (add*0xFFFFFFFF))); #endif } } } } extern "C" __global__ void finishSpreadCharge(long long* __restrict__ pmeGrid) { real2* floatGrid = (real2*) pmeGrid; const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; real scale = EPSILON_FACTOR/(real) 0xFFFFFFFF; for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) { #ifdef USE_DOUBLE_PRECISION long long value = pmeGrid[2*index]; #else long long value = pmeGrid[index]; #endif real2 floatValue = make_real2((real) (value*scale), 0); floatGrid[index] = floatValue; } } extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, real* __restrict__ energyBuffer, const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ, real4 periodicBoxSize, real4 invPeriodicBoxSize) { const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z); real energy = 0; for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) { int kx = index/(GRID_SIZE_Y*GRID_SIZE_Z); int remainder = index-kx*GRID_SIZE_Y*GRID_SIZE_Z; int ky = remainder/GRID_SIZE_Z; int kz = remainder-ky*GRID_SIZE_Z; if (kx == 0 && ky == 0 && kz == 0) continue; int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X); int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y); int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z); real mhx = mx*invPeriodicBoxSize.x; real mhy = my*invPeriodicBoxSize.y; real mhz = mz*invPeriodicBoxSize.z; real bx = pmeBsplineModuliX[kx]; real by = pmeBsplineModuliY[ky]; real bz = pmeBsplineModuliZ[kz]; real2 grid = pmeGrid[index]; real m2 = mhx*mhx+mhy*mhy+mhz*mhz; real denom = m2*bx*by*bz; real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom; pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm); energy += eterm*(grid.x*grid.x + grid.y*grid.y); } energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy; } extern "C" __global__ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real2* __restrict__ pmeGrid, real4 periodicBoxSize, real4 invPeriodicBoxSize) { real3 data[PME_ORDER]; real3 ddata[PME_ORDER]; const real scale = RECIP(PME_ORDER-1); for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) { real3 force = make_real3(0); real4 pos = posq[atom]; pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z; real3 t = make_real3((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X, (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y, (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z); int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X, ((int) t.y) % GRID_SIZE_Y, ((int) t.z) % GRID_SIZE_Z); // Since we need the full set of thetas, it's faster to compute them here than load them // from global memory. real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z); data[PME_ORDER-1] = make_real3(0); data[1] = dr; data[0] = make_real3(1)-dr; for (int j = 3; j < PME_ORDER; j++) { real div = RECIP(j-1); data[j-1] = div*dr*data[j-2]; for (int k = 1; k < (j-1); k++) data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]); data[0] = div*(make_real3(1)-dr)*data[0]; } ddata[0] = -data[0]; for (int j = 1; j < PME_ORDER; j++) ddata[j] = data[j-1]-data[j]; data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2]; for (int j = 1; j < (PME_ORDER-1); j++) data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]); data[0] = scale*(make_real3(1)-dr)*data[0]; // Compute the force on this atom. for (int ix = 0; ix < PME_ORDER; ix++) { int xbase = gridIndex.x+ix; xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0); xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z; real dx = data[ix].x; real ddx = ddata[ix].x; for (int iy = 0; iy < PME_ORDER; iy++) { int ybase = gridIndex.y+iy; ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0); ybase = xbase + ybase*GRID_SIZE_Z; real dy = data[iy].y; real ddy = ddata[iy].y; for (int iz = 0; iz < PME_ORDER; iz++) { int zindex = gridIndex.z+iz; zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); int index = ybase + zindex; real gridvalue = pmeGrid[index].x; force.x += ddx*dy*data[iz].z*gridvalue; force.y += dx*ddy*data[iz].z*gridvalue; force.z += dx*dy*ddata[iz].z*gridvalue; } } } real q = pos.w*EPSILON_FACTOR; atomicAdd(&forceBuffers[atom], static_cast((long long) (-q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x*0xFFFFFFFF))); atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast((long long) (-q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y*0xFFFFFFFF))); atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast((long long) (-q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z*0xFFFFFFFF))); } }