Commit 235a88e5 authored by Peter Eastman's avatar Peter Eastman
Browse files

Minor cleanup to PME

parent b27a0bd6
...@@ -1490,6 +1490,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1490,6 +1490,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
hasInitializedFFT = true; hasInitializedFFT = true;
// Initialize the b-spline moduli. // Initialize the b-spline moduli.
int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ); int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ);
vector<double> data(PmeOrder); vector<double> data(PmeOrder);
vector<double> ddata(PmeOrder); vector<double> ddata(PmeOrder);
...@@ -1601,7 +1602,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1601,7 +1602,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums->getDevicePointer(), cu.getPeriodicBoxSizePointer()}; void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums->getDevicePointer(), cu.getPeriodicBoxSizePointer()};
cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms()); cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
} }
if (convolvedPmeGrid != NULL && originalPmeGrid != NULL && reciprocalPmeGrid != NULL && cu.getContextIndex() == 0 && includeReciprocal) { if (originalPmeGrid != NULL && cu.getContextIndex() == 0 && includeReciprocal) {
void* bsplinesArgs[] = {&cu.getPosq().getDevicePointer(), &pmeBsplineTheta->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), void* bsplinesArgs[] = {&cu.getPosq().getDevicePointer(), &pmeBsplineTheta->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()}; cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
int bsplinesSharedSize = cu.ThreadBlockSize*PmeOrder*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4)); int bsplinesSharedSize = cu.ThreadBlockSize*PmeOrder*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4));
...@@ -1635,8 +1636,10 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1635,8 +1636,10 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
else else
cufftExecC2R(fftBackward, (float2*) reciprocalPmeGrid->getDevicePointer(), (float*) convolvedPmeGrid->getDevicePointer()); cufftExecC2R(fftBackward, (float2*) reciprocalPmeGrid->getDevicePointer(), (float*) convolvedPmeGrid->getDevicePointer());
if (includeEnergy) {
void* computeEnergyArgs[] = {&originalPmeGrid->getDevicePointer(), &convolvedPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer() }; void* computeEnergyArgs[] = {&originalPmeGrid->getDevicePointer(), &convolvedPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer() };
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, cu.getNumAtoms()); cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, cu.getNumAtoms());
}
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &convolvedPmeGrid->getDevicePointer(), void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &convolvedPmeGrid->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()}; cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
......
...@@ -595,13 +595,9 @@ private: ...@@ -595,13 +595,9 @@ private:
CudaArray* sigmaEpsilon; CudaArray* sigmaEpsilon;
CudaArray* exceptionParams; CudaArray* exceptionParams;
CudaArray* cosSinSums; CudaArray* cosSinSums;
//TODO: separate into realpmeGrid, complex pmegrid, and resultpmeGrid
CudaArray* originalPmeGrid; CudaArray* originalPmeGrid;
CudaArray* reciprocalPmeGrid; CudaArray* reciprocalPmeGrid;
CudaArray* convolvedPmeGrid; CudaArray* convolvedPmeGrid;
CudaArray* pmeBsplineModuliX; CudaArray* pmeBsplineModuliX;
CudaArray* pmeBsplineModuliY; CudaArray* pmeBsplineModuliY;
CudaArray* pmeBsplineModuliZ; CudaArray* pmeBsplineModuliZ;
...@@ -610,10 +606,8 @@ private: ...@@ -610,10 +606,8 @@ private:
CudaArray* pmeAtomRange; CudaArray* pmeAtomRange;
CudaArray* pmeAtomGridIndex; CudaArray* pmeAtomGridIndex;
CudaSort* sort; CudaSort* sort;
cufftHandle fftForward; cufftHandle fftForward;
cufftHandle fftBackward; cufftHandle fftBackward;
CUfunction ewaldSumsKernel; CUfunction ewaldSumsKernel;
CUfunction ewaldForcesKernel; CUfunction ewaldForcesKernel;
CUfunction pmeGridIndexKernel; CUfunction pmeGridIndexKernel;
...@@ -622,9 +616,7 @@ private: ...@@ -622,9 +616,7 @@ private:
CUfunction pmeUpdateBsplinesKernel; CUfunction pmeUpdateBsplinesKernel;
CUfunction pmeSpreadChargeKernel; CUfunction pmeSpreadChargeKernel;
CUfunction pmeFinishSpreadChargeKernel; CUfunction pmeFinishSpreadChargeKernel;
/* TESTING ENERGY KERNEL */
CUfunction pmeEvalEnergyKernel; CUfunction pmeEvalEnergyKernel;
CUfunction pmeConvolutionKernel; CUfunction pmeConvolutionKernel;
CUfunction pmeInterpolateForceKernel; CUfunction pmeInterpolateForceKernel;
std::map<std::string, std::string> pmeDefines; std::map<std::string, std::string> pmeDefines;
......
...@@ -53,6 +53,7 @@ extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIn ...@@ -53,6 +53,7 @@ extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIn
} }
// Fill in values beyond the last atom. // Fill in values beyond the last atom.
if (blockIdx.x == gridDim.x-1 && threadIdx.x == blockDim.x-1) { if (blockIdx.x == gridDim.x-1 && threadIdx.x == blockDim.x-1) {
int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int j = last+1; j <= gridSize; ++j) for (int j = last+1; j <= gridSize; ++j)
...@@ -139,13 +140,11 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict_ ...@@ -139,13 +140,11 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict_
const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliX,
const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ,
real4 periodicBoxSize, real4 invPeriodicBoxSize) { real4 periodicBoxSize, real4 invPeriodicBoxSize) {
// R2C stores into a half complex matrix where the last dimension is cut by half
//R2C stores into a half complex matrix where the last dimension is cut by half
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1); const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z); const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
for( int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x ) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
// real indices // real indices
int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z/2+1)); int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z/2+1); int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
...@@ -180,14 +179,11 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict_ ...@@ -180,14 +179,11 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict_
} }
extern "C" __global__ extern "C" __global__
void gridEvaluateEnergy(const real * __restrict__ originalGrid, const real * __restrict convolvedGrid, real * __restrict__ energyBuffer) { void gridEvaluateEnergy(const real* __restrict__ originalGrid, const real* __restrict__ convolvedGrid, real* __restrict__ energyBuffer) {
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
real energy = 0; real energy = 0;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x)
for( int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x ) {
energy += originalGrid[index]*convolvedGrid[index]; energy += originalGrid[index]*convolvedGrid[index];
}
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5*energy; energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5*energy;
} }
...@@ -213,16 +209,15 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __ ...@@ -213,16 +209,15 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
// Since we need the full set of thetas, it's faster to compute them here than load them // Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory. // from global memory.
real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z); real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
data[PME_ORDER-1] = make_real3(0); data[PME_ORDER-1] = make_real3(0);
data[1] = dr; data[1] = dr;
data[0] = make_real3(1)-dr; data[0] = make_real3(1)-dr;
for (int j = 3; j < PME_ORDER; j++) { for (int j = 3; j < PME_ORDER; j++) {
real div = RECIP(j-1); real div = RECIP(j-1);
data[j-1] = div*dr*data[j-2]; data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++) for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]); data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
data[0] = div*(make_real3(1)-dr)*data[0]; data[0] = div*(make_real3(1)-dr)*data[0];
...@@ -233,7 +228,6 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __ ...@@ -233,7 +228,6 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
ddata[j] = data[j-1]-data[j]; ddata[j] = data[j-1]-data[j];
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2]; data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++) for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]); data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
data[0] = scale*(make_real3(1)-dr)*data[0]; data[0] = scale*(make_real3(1)-dr)*data[0];
...@@ -266,7 +260,6 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __ ...@@ -266,7 +260,6 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
} }
} }
real q = pos.w*EPSILON_FACTOR; real q = pos.w*EPSILON_FACTOR;
forceBuffers[atom] += static_cast<unsigned long long>((long long) (-q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x*0xFFFFFFFF)); forceBuffers[atom] += static_cast<unsigned long long>((long long) (-q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x*0xFFFFFFFF));
forceBuffers[atom+PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y*0xFFFFFFFF)); forceBuffers[atom+PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y*0xFFFFFFFF));
forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z*0xFFFFFFFF)); forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z*0xFFFFFFFF));
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment