Commit 125e52ae authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimizations for PME

parent 3638f243
......@@ -359,12 +359,14 @@ struct cudaGmxSimulation {
float4* pTabulatedFunctionParams; // The min, max, and spacing for each tabulated function
float2* pEwaldCosSinSum; // Pointer to the cos/sin sums (ewald)
int3 pmeGridSize; // The dimensions of the grid for particle mesh Ewald
int3 pmeGroupSize; // The dimensions of the groups used in charge spreading for PME
cufftComplex* pPmeGrid; // Grid points for particle mesh Ewald
float* pPmeBsplineModuli[3];
float4* pPmeBsplineTheta;
float4* pPmeBsplineDtheta;
int4* pPmeParticleIndex; // The grid indices for each atom
float4* pPmeParticleFraction; // Fractional offset in the grid for each atom in all three dimensions.
int* pPmeInteractionFlags; // Flags for which groups of grid points interact with which atoms
unsigned int bonds; // Number of bonds
int4* pBondID; // Bond atom and output buffer IDs
float2* pBondParameter; // Bond parameters
......
......@@ -761,8 +761,12 @@ extern "C"
void gpuSetPMEParameters(gpuContext gpu, float alpha)
{
gpu->sim.alphaEwald = alpha;
int3 gridSize = make_int3(16, 16, 16);
int3 gridSize = make_int3(32, 32, 32);
gpu->sim.pmeGridSize = gridSize;
int3 groupSize = make_int3(2, 4, 4);
gpu->sim.pmeGroupSize = groupSize;
const int3 numGroups = make_int3((gridSize.x+groupSize.x-1)/groupSize.x, (gridSize.y+groupSize.y-1)/groupSize.y, (gridSize.z+groupSize.z-1)/groupSize.z);
const unsigned int totalGroups = numGroups.x*numGroups.y*numGroups.z;
cufftPlan3d(&gpu->fftplan, gridSize.x, gridSize.y, gridSize.z, CUFFT_C2C);
gpu->psPmeGrid = new CUDAStream<cufftComplex>(gridSize.x*gridSize.y*gridSize.z, 1, "PmeGrid");
gpu->sim.pPmeGrid = gpu->psPmeGrid->_pDevData;
......@@ -780,6 +784,8 @@ void gpuSetPMEParameters(gpuContext gpu, float alpha)
gpu->sim.pPmeParticleIndex = gpu->psPmeParticleIndex->_pDevData;
gpu->psPmeParticleFraction = new CUDAStream<float4>(gpu->natoms, 1, "PmeParticleFraction");
gpu->sim.pPmeParticleFraction = gpu->psPmeParticleFraction->_pDevData;
gpu->psPmeInteractionFlags = new CUDAStream<int>(totalGroups*(gpu->sim.paddedNumberOfAtoms/32), 1, "PmeInteractionFlags");
gpu->sim.pPmeInteractionFlags = gpu->psPmeInteractionFlags->_pDevData;
// Initialize the b-spline moduli.
......
......@@ -112,6 +112,7 @@ struct _gpuContext {
CUDAStream<float4>* psPmeBsplineDtheta;
CUDAStream<int4>* psPmeParticleIndex; // The grid indices for each atom
CUDAStream<float4>* psPmeParticleFraction; // Fractional offset in the grid for each atom in all three dimensions.
CUDAStream<int>* psPmeInteractionFlags; // Flags for which groups of grid points interact with which atoms
CUDAStream<float2>* psObcData;
CUDAStream<float>* psObcChain;
CUDAStream<float>* psBornForce;
......
......@@ -94,9 +94,6 @@ __global__ void kUpdateGridIndexAndFraction_kernel()
for (int i = tid; i < cSim.atoms; i += tnb)
{
float4 ftmp = cSim.pPosq[i];
__syncthreads();
float3 t = make_float3((ftmp.x/cSim.periodicBoxSizeX+1.0f)*cSim.pmeGridSize.x,
(ftmp.y/cSim.periodicBoxSizeY+1.0f)*cSim.pmeGridSize.y,
(ftmp.z/cSim.periodicBoxSizeZ+1.0f)*cSim.pmeGridSize.z);
......@@ -104,17 +101,53 @@ __global__ void kUpdateGridIndexAndFraction_kernel()
ftmp.x = modff(t.x, &tix.x);
ftmp.y = modff(t.y, &tix.y);
ftmp.z = modff(t.z, &tix.z);
cSim.pPmeParticleFraction[i] = ftmp;
/* avoid costly % operations if possible that is if dc_ngrid.* is pow. of 2 */
int4 itmp = make_int4(fast_mod(__float2int_rd(tix.x), cSim.pmeGridSize.x),
fast_mod(__float2int_rd(tix.y), cSim.pmeGridSize.y),
fast_mod(__float2int_rd(tix.z), cSim.pmeGridSize.z), 0);
cSim.pPmeParticleIndex[i] = itmp;
}
__syncthreads();
// Compute flags for which atoms affect which groups of grid points.
const int3 numGroups = make_int3((cSim.pmeGridSize.x+cSim.pmeGroupSize.x-1)/cSim.pmeGroupSize.x, (cSim.pmeGridSize.y+cSim.pmeGroupSize.y-1)/cSim.pmeGroupSize.y, (cSim.pmeGridSize.z+cSim.pmeGroupSize.z-1)/cSim.pmeGroupSize.z);
const unsigned int totalGroups = numGroups.x*numGroups.y*numGroups.z;
const float3 gridScale = make_float3(cSim.pmeGridSize.x/cSim.periodicBoxSizeX, cSim.pmeGridSize.y/cSim.periodicBoxSizeY, cSim.pmeGridSize.z/cSim.periodicBoxSizeZ);
for (int group = tid; group < totalGroups; group += tnb)
{
int3 gridBase;
gridBase.x = group/(numGroups.y*numGroups.z);
int remainder = group-gridBase.x*numGroups.y*numGroups.z;
gridBase.y = remainder/numGroups.z;
gridBase.z = remainder-gridBase.y*numGroups.z;
gridBase.x *= cSim.pmeGroupSize.x;
gridBase.y *= cSim.pmeGroupSize.y;
gridBase.z *= cSim.pmeGroupSize.z;
unsigned int flags = 0;
unsigned int baseIndex = group*(cSim.paddedNumberOfAtoms/32);
for (int atomBlock = 0; atomBlock < cSim.paddedNumberOfAtoms>>GRIDBITS; atomBlock++)
{
// Decide if this block actually needs to be processed.
int flagIndex = atomBlock%32;
if (flagIndex == 0)
flags = 0;
float4 boxSize = cSim.pGridBoundingBox[atomBlock];
float4 center = cSim.pGridCenter[atomBlock];
int maxx = (int) ceil((center.x+boxSize.x)*gridScale.x)+cSim.pmeGroupSize.x+PME_ORDER;
int maxy = (int) ceil((center.y+boxSize.y)*gridScale.y)+cSim.pmeGroupSize.y+PME_ORDER;
int maxz = (int) ceil((center.z+boxSize.z)*gridScale.z)+cSim.pmeGroupSize.z+PME_ORDER;
int minx = (int) floor((center.x-boxSize.x)*gridScale.x);
int miny = (int) floor((center.y-boxSize.y)*gridScale.y);
int minz = (int) floor((center.z-boxSize.z)*gridScale.z);
int x = minx+(gridBase.x-minx)%cSim.pmeGridSize.x;
int y = miny+(gridBase.y-miny)%cSim.pmeGridSize.y;
int z = minz+(gridBase.z-minz)%cSim.pmeGridSize.z;
if (maxx < x || maxy < y || maxz < z)
flags += 1<<flagIndex;
if (flagIndex == 31 || atomBlock == cSim.paddedNumberOfAtoms>>GRIDBITS)
cSim.pPmeInteractionFlags[baseIndex+atomBlock/32] = flags;
}
}
}
......@@ -140,8 +173,6 @@ __global__ void kUpdateBsplines_kernel()
float4 dr = cSim.pPmeParticleFraction[i];
__syncthreads();
data[PME_ORDER - 1] = make_float4(0.0f);
data[1] = dr;
data[0] = make_float4(1.0f) - dr;
......@@ -178,14 +209,11 @@ __global__ void kUpdateBsplines_kernel()
}
data[0] = div_o * (-dr + 1.0f) * data[0];
__syncthreads();
for (int j = 0; j < PME_ORDER; j++)
{
cSim.pPmeBsplineTheta[i + j*cSim.atoms] = data[j];
cSim.pPmeBsplineDtheta[i + j*cSim.atoms] = ddata[j];
}
__syncthreads();
}
}
......@@ -193,11 +221,9 @@ __global__ void kGridSpreadCharge_kernel()
{
extern __shared__ float atomCharge[];
int4* atomGridIndex = (int4*) &atomCharge[blockDim.x];
float4* bsplineTheta = (float4*) &atomGridIndex[blockDim.x];
const unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
const int3 groupDim = make_int3(4, 4, 2);
const int3 numGroups = make_int3((cSim.pmeGridSize.x+groupDim.x-1)/groupDim.x, (cSim.pmeGridSize.y+groupDim.y-1)/groupDim.y, (cSim.pmeGridSize.z+groupDim.z-1)/groupDim.z);
const int3 numGroups = make_int3((cSim.pmeGridSize.x+cSim.pmeGroupSize.x-1)/cSim.pmeGroupSize.x, (cSim.pmeGridSize.y+cSim.pmeGroupSize.y-1)/cSim.pmeGroupSize.y, (cSim.pmeGridSize.z+cSim.pmeGroupSize.z-1)/cSim.pmeGroupSize.z);
const unsigned int totalGroups = numGroups.x*numGroups.y*numGroups.z;
unsigned int group = warp*totalGroups/totalWarps;
const unsigned int end = (warp+1)*totalGroups/totalWarps;
......@@ -205,7 +231,7 @@ __global__ void kGridSpreadCharge_kernel()
while (group < end)
{
// Process a group of grid points of size groupDim. First figure out the base index for the group,
// Process a group of grid points of size cSim.pmeGroupSize. First figure out the base index for the group,
// and the index of the specific point this thread will handle.
int3 gridBase;
......@@ -213,14 +239,14 @@ __global__ void kGridSpreadCharge_kernel()
int remainder = group-gridBase.x*numGroups.y*numGroups.z;
gridBase.y = remainder/numGroups.z;
gridBase.z = remainder-gridBase.y*numGroups.z;
gridBase.x *= groupDim.x;
gridBase.y *= groupDim.y;
gridBase.z *= groupDim.z;
gridBase.x *= cSim.pmeGroupSize.x;
gridBase.y *= cSim.pmeGroupSize.y;
gridBase.z *= cSim.pmeGroupSize.z;
int3 gridPoint;
gridPoint.x = index/(groupDim.y*groupDim.z);
remainder = index-gridPoint.x*groupDim.y*groupDim.z;
gridPoint.y = remainder/groupDim.z;
gridPoint.z = remainder-gridPoint.y*groupDim.z;
gridPoint.x = index/(cSim.pmeGroupSize.y*cSim.pmeGroupSize.z);
remainder = index-gridPoint.x*cSim.pmeGroupSize.y*cSim.pmeGroupSize.z;
gridPoint.y = remainder/cSim.pmeGroupSize.z;
gridPoint.z = remainder-gridPoint.y*cSim.pmeGroupSize.z;
gridPoint.x += gridBase.x;
gridPoint.y += gridBase.y;
gridPoint.z += gridBase.z;
......@@ -228,17 +254,22 @@ __global__ void kGridSpreadCharge_kernel()
// Loop over blocks of atoms.
float result = 0.0f;
int flags = 0;
unsigned int baseIndex = group*(cSim.paddedNumberOfAtoms/32);
for (int atomBlock = 0; atomBlock < cSim.paddedNumberOfAtoms>>GRIDBITS; atomBlock++)
{
// Decide if this block actually needs to be processed.
int flagIndex = atomBlock%32;
if (flagIndex == 0)
flags = cSim.pPmeInteractionFlags[baseIndex+atomBlock/32];
if ((flags & (1<<flagIndex)) != 0)
continue;
int atomIndex = (atomBlock<<GRIDBITS)+index;
if (atomIndex < cSim.atoms)
{
atomCharge[threadIdx.x] = cSim.pPosq[atomIndex].w;
atomGridIndex[threadIdx.x] = cSim.pPmeParticleIndex[atomIndex];
// bsplineTheta[threadIdx.x] = cSim.pPmeBsplineTheta[atomIndex];
// bsplineTheta[threadIdx.x+blockDim.x] = cSim.pPmeBsplineTheta[atomIndex+cSim.atoms];
// bsplineTheta[threadIdx.x+2*blockDim.x] = cSim.pPmeBsplineTheta[atomIndex+2*cSim.atoms];
// bsplineTheta[threadIdx.x+3*blockDim.x] = cSim.pPmeBsplineTheta[atomIndex+3*cSim.atoms];
}
int maxAtoms = min(GRID, cSim.atoms-(atomBlock<<GRIDBITS));
for (int i = 0; i < maxAtoms; i++)
......@@ -248,14 +279,10 @@ __global__ void kGridSpreadCharge_kernel()
int ix = gridPoint.x-atomGridIndex[localIndex].x;
int iy = gridPoint.y-atomGridIndex[localIndex].y;
int iz = gridPoint.z-atomGridIndex[localIndex].z;
if (ix < 0)
ix += cSim.pmeGridSize.x;
if (iy < 0)
iy += cSim.pmeGridSize.y;
if (iz < 0)
iz += cSim.pmeGridSize.z;
ix += (ix < 0 ? cSim.pmeGridSize.x : 0);
iy += (iy < 0 ? cSim.pmeGridSize.y : 0);
iz += (iz < 0 ? cSim.pmeGridSize.z : 0);
if (ix < PME_ORDER && iy < PME_ORDER && iz < PME_ORDER)
// result += atomCharge[threadIdx.x-index+i]*bsplineTheta[localIndex+ix*blockDim.x].x*bsplineTheta[localIndex+iy*blockDim.x].y*bsplineTheta[localIndex+iz*blockDim.x].z;
result += atomCharge[threadIdx.x-index+i]*cSim.pPmeBsplineTheta[atomIndex+ix*cSim.atoms].x*cSim.pPmeBsplineTheta[atomIndex+iy*cSim.atoms].y*cSim.pPmeBsplineTheta[atomIndex+iz*cSim.atoms].z;
}
}
......@@ -346,7 +373,7 @@ void kCalculatePME(gpuContext gpu)
unsigned int threads = 16380/(2*PME_ORDER*sizeof(float4));
kUpdateBsplines_kernel<<<gpu->sim.blocks, threads, 2*threads*PME_ORDER*sizeof(float4)>>>();
LAUNCHERROR("kUpdateBsplines");
kGridSpreadCharge_kernel<<<gpu->sim.blocks, 64, 64*(sizeof(float)+sizeof(int4)+4*sizeof(float4))>>>();
kGridSpreadCharge_kernel<<<gpu->sim.blocks, 64, 64*(sizeof(float)+sizeof(int4))>>>();
LAUNCHERROR("kGridSpreadCharge");
cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_FORWARD);
kReciprocalConvolution_kernel<<<gpu->sim.blocks, gpu->sim.nonbond_threads_per_block>>>();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment