Commit b27a0bd6 authored by Yutong Zhao's avatar Yutong Zhao
Browse files

Committing FFTWs to work with complex conjugate arrays. Decoupled energy and...

Committing FFTWs to work with complex conjugate arrays. Decoupled energy and force calculation into two separate kernels.
parent 32d3f86d
...@@ -1282,8 +1282,12 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() { ...@@ -1282,8 +1282,12 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
delete exceptionParams; delete exceptionParams;
if (cosSinSums != NULL) if (cosSinSums != NULL)
delete cosSinSums; delete cosSinSums;
if (pmeGrid != NULL) if (originalPmeGrid != NULL)
delete pmeGrid; delete originalPmeGrid;
if (reciprocalPmeGrid != NULL)
delete reciprocalPmeGrid;
if (convolvedPmeGrid != NULL)
delete convolvedPmeGrid;
if (pmeBsplineModuliX != NULL) if (pmeBsplineModuliX != NULL)
delete pmeBsplineModuliX; delete pmeBsplineModuliX;
if (pmeBsplineModuliY != NULL) if (pmeBsplineModuliY != NULL)
...@@ -1300,8 +1304,10 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() { ...@@ -1300,8 +1304,10 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
delete pmeAtomGridIndex; delete pmeAtomGridIndex;
if (sort != NULL) if (sort != NULL)
delete sort; delete sort;
if (hasInitializedFFT) if (hasInitializedFFT) {
cufftDestroy(fft); cufftDestroy(fftForward);
cufftDestroy(fftBackward);
}
} }
/** /**
...@@ -1422,10 +1428,12 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1422,10 +1428,12 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
// Compute the PME parameters. // Compute the PME parameters.
int gridSizeX, gridSizeY, gridSizeZ; int gridSizeX, gridSizeY, gridSizeZ;
NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ); NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ);
gridSizeX = findFFTDimension(gridSizeX); gridSizeX = findFFTDimension(gridSizeX);
gridSizeY = findFFTDimension(gridSizeY); gridSizeY = findFFTDimension(gridSizeY);
gridSizeZ = findFFTDimension(gridSizeZ); gridSizeZ = findFFTDimension(gridSizeZ);
defines["EWALD_ALPHA"] = cu.doubleToString(alpha); defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI)); defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
defines["USE_EWALD"] = "1"; defines["USE_EWALD"] = "1";
...@@ -1447,14 +1455,20 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1447,14 +1455,20 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge"); pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution"); pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce"); pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge"); pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1); cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
// Create required data structures. // Create required data structures.
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
pmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid");
cu.addAutoclearBuffer(*pmeGrid); originalPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, cu.getComputeCapability() >= 2.0 ? elementSize : sizeof(long long), "originalPmeGrid");
convolvedPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, elementSize, "convolvedPmeGrid");
reciprocalPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*(gridSizeZ/2+1), 2*elementSize, "reciprocalPmeGrid");
cu.addAutoclearBuffer(*originalPmeGrid);
pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX"); pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY"); pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ"); pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
...@@ -1462,13 +1476,20 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1462,13 +1476,20 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange"); pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex"); pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms()); sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
cufftResult result = cufftPlan3d(&fft, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2Z : CUFFT_C2C);
cufftResult result = cufftPlan3d(&fftForward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C);
if (result != CUFFT_SUCCESS) if (result != CUFFT_SUCCESS)
throw OpenMMException("Error initializing FFT: "+cu.intToString(result)); throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
result = cufftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
if (result != CUFFT_SUCCESS)
throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
cufftSetCompatibilityMode(fftForward, CUFFT_COMPATIBILITY_NATIVE);
cufftSetCompatibilityMode(fftBackward, CUFFT_COMPATIBILITY_NATIVE);
hasInitializedFFT = true; hasInitializedFFT = true;
// Initialize the b-spline moduli. // Initialize the b-spline moduli.
int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ); int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ);
vector<double> data(PmeOrder); vector<double> data(PmeOrder);
vector<double> ddata(PmeOrder); vector<double> ddata(PmeOrder);
...@@ -1542,7 +1563,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1542,7 +1563,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
ewaldSelfEnergy = 0.0; ewaldSelfEnergy = 0.0;
// Add the interaction to the default nonbonded kernel. // Add the interaction to the default nonbonded kernel.
string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines); string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines);
cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup()); cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
if (hasLJ) if (hasLJ)
...@@ -1580,34 +1601,47 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1580,34 +1601,47 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums->getDevicePointer(), cu.getPeriodicBoxSizePointer()}; void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums->getDevicePointer(), cu.getPeriodicBoxSizePointer()};
cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms()); cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
} }
if (pmeGrid != NULL && cu.getContextIndex() == 0 && includeReciprocal) { if (convolvedPmeGrid != NULL && originalPmeGrid != NULL && reciprocalPmeGrid != NULL && cu.getContextIndex() == 0 && includeReciprocal) {
void* bsplinesArgs[] = {&cu.getPosq().getDevicePointer(), &pmeBsplineTheta->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), void* bsplinesArgs[] = {&cu.getPosq().getDevicePointer(), &pmeBsplineTheta->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()}; cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
int bsplinesSharedSize = cu.ThreadBlockSize*PmeOrder*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4)); int bsplinesSharedSize = cu.ThreadBlockSize*PmeOrder*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4));
cu.executeKernel(pmeUpdateBsplinesKernel, bsplinesArgs, cu.getNumAtoms(), cu.ThreadBlockSize, bsplinesSharedSize); cu.executeKernel(pmeUpdateBsplinesKernel, bsplinesArgs, cu.getNumAtoms(), cu.ThreadBlockSize, bsplinesSharedSize);
sort->sort(*pmeAtomGridIndex); sort->sort(*pmeAtomGridIndex);
void* rangeArgs[] = {&pmeAtomGridIndex->getDevicePointer(), &pmeAtomRange->getDevicePointer(), &cu.getPosq().getDevicePointer(), void* rangeArgs[] = {&pmeAtomGridIndex->getDevicePointer(), &pmeAtomRange->getDevicePointer(), &cu.getPosq().getDevicePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()}; cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeAtomRangeKernel, rangeArgs, cu.getNumAtoms()); cu.executeKernel(pmeAtomRangeKernel, rangeArgs, cu.getNumAtoms());
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid->getDevicePointer(), &pmeBsplineTheta->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()}; void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &originalPmeGrid->getDevicePointer(), &pmeBsplineTheta->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), PmeOrder*PmeOrder*PmeOrder); cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), PmeOrder*PmeOrder*PmeOrder);
void* finishSpreadArgs[] = {&pmeGrid->getDevicePointer()}; void* finishSpreadArgs[] = {&originalPmeGrid->getDevicePointer()};
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid->getSize());
if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0) {
void* finishSpreadArgs[] = {&originalPmeGrid->getDevicePointer()};
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, originalPmeGrid->getSize());
}
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD); cufftExecD2Z(fftForward, (double*) originalPmeGrid->getDevicePointer(), (double2*) reciprocalPmeGrid->getDevicePointer());
else else
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD); cufftExecR2C(fftForward, (float*) originalPmeGrid->getDevicePointer(), (float2*) reciprocalPmeGrid->getDevicePointer());
void* convolutionArgs[] = {&pmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(),
&pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()}; void* convolutionArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeConvolutionKernel, convolutionArgs, cu.getNumAtoms()); cu.executeKernel(pmeConvolutionKernel, convolutionArgs, cu.getNumAtoms());
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE); cufftExecZ2D(fftBackward, (double2*) reciprocalPmeGrid->getDevicePointer(), (double*) convolvedPmeGrid->getDevicePointer());
else else
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE); cufftExecC2R(fftBackward, (float2*) reciprocalPmeGrid->getDevicePointer(), (float*) convolvedPmeGrid->getDevicePointer());
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid->getDevicePointer(),
void* computeEnergyArgs[] = {&originalPmeGrid->getDevicePointer(), &convolvedPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer() };
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, cu.getNumAtoms());
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &convolvedPmeGrid->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()}; cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms()); cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms());
} }
double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0); double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
if (dispersionCoefficient != 0.0 && includeDirect) { if (dispersionCoefficient != 0.0 && includeDirect) {
......
...@@ -549,7 +549,7 @@ private: ...@@ -549,7 +549,7 @@ private:
class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel { class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public: public:
CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcNonbondedForceKernel(name, platform), CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcNonbondedForceKernel(name, platform),
cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL), cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), originalPmeGrid(NULL), reciprocalPmeGrid(NULL), convolvedPmeGrid(NULL),
pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDTheta(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDTheta(NULL),
pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL) { pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL) {
} }
...@@ -595,7 +595,13 @@ private: ...@@ -595,7 +595,13 @@ private:
CudaArray* sigmaEpsilon; CudaArray* sigmaEpsilon;
CudaArray* exceptionParams; CudaArray* exceptionParams;
CudaArray* cosSinSums; CudaArray* cosSinSums;
CudaArray* pmeGrid;
//TODO: separate into realpmeGrid, complex pmegrid, and resultpmeGrid
CudaArray* originalPmeGrid;
CudaArray* reciprocalPmeGrid;
CudaArray* convolvedPmeGrid;
CudaArray* pmeBsplineModuliX; CudaArray* pmeBsplineModuliX;
CudaArray* pmeBsplineModuliY; CudaArray* pmeBsplineModuliY;
CudaArray* pmeBsplineModuliZ; CudaArray* pmeBsplineModuliZ;
...@@ -604,7 +610,10 @@ private: ...@@ -604,7 +610,10 @@ private:
CudaArray* pmeAtomRange; CudaArray* pmeAtomRange;
CudaArray* pmeAtomGridIndex; CudaArray* pmeAtomGridIndex;
CudaSort* sort; CudaSort* sort;
cufftHandle fft;
cufftHandle fftForward;
cufftHandle fftBackward;
CUfunction ewaldSumsKernel; CUfunction ewaldSumsKernel;
CUfunction ewaldForcesKernel; CUfunction ewaldForcesKernel;
CUfunction pmeGridIndexKernel; CUfunction pmeGridIndexKernel;
...@@ -613,6 +622,9 @@ private: ...@@ -613,6 +622,9 @@ private:
CUfunction pmeUpdateBsplinesKernel; CUfunction pmeUpdateBsplinesKernel;
CUfunction pmeSpreadChargeKernel; CUfunction pmeSpreadChargeKernel;
CUfunction pmeFinishSpreadChargeKernel; CUfunction pmeFinishSpreadChargeKernel;
/* TESTING ENERGY KERNEL */
CUfunction pmeEvalEnergyKernel;
CUfunction pmeConvolutionKernel; CUfunction pmeConvolutionKernel;
CUfunction pmeInterpolateForceKernel; CUfunction pmeInterpolateForceKernel;
std::map<std::string, std::string> pmeDefines; std::map<std::string, std::string> pmeDefines;
......
...@@ -53,7 +53,6 @@ extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIn ...@@ -53,7 +53,6 @@ extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIn
} }
// Fill in values beyond the last atom. // Fill in values beyond the last atom.
if (blockIdx.x == gridDim.x-1 && threadIdx.x == blockDim.x-1) { if (blockIdx.x == gridDim.x-1 && threadIdx.x == blockDim.x-1) {
int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int j = last+1; j <= gridSize; ++j) for (int j = last+1; j <= gridSize; ++j)
...@@ -62,8 +61,8 @@ extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIn ...@@ -62,8 +61,8 @@ extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIn
} }
#define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER) #define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER)
extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid,
extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, unsigned long long* __restrict__ pmeGrid, const real4* __restrict__ pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) { const real4* __restrict__ pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
int ix = threadIdx.x/(PME_ORDER*PME_ORDER); int ix = threadIdx.x/(PME_ORDER*PME_ORDER);
int remainder = threadIdx.x-ix*PME_ORDER*PME_ORDER; int remainder = threadIdx.x-ix*PME_ORDER*PME_ORDER;
int iy = remainder/PME_ORDER; int iy = remainder/PME_ORDER;
...@@ -103,66 +102,102 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, unsi ...@@ -103,66 +102,102 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, unsi
y -= (y >= GRID_SIZE_Y ? GRID_SIZE_Y : 0); y -= (y >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
#ifdef USE_DOUBLE_PRECISION #ifdef USE_DOUBLE_PRECISION
atomicAdd(&pmeGrid[2*(x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z)], static_cast<unsigned long long>((long long) (add*0xFFFFFFFF))); unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
atomicAdd(&ulonglong_p[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], static_cast<unsigned long long>((long long) (add*0xFFFFFFFF)));
#elif __CUDA_ARCH__ < 200
unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
int gridIndex = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z;
gridIndex = (gridIndex%2 == 0 ? gridIndex/2 : (gridIndex+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2);
atomicAdd(&ulonglong_p[gridIndex], static_cast<unsigned long long>((long long) (add*0xFFFFFFFF)));
#else #else
atomicAdd(&pmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], static_cast<unsigned long long>((long long) (add*0xFFFFFFFF))); atomicAdd(&originalPmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], add*EPSILON_FACTOR);
#endif #endif
} }
} }
} }
} }
extern "C" __global__ void finishSpreadCharge(long long* __restrict__ pmeGrid) { extern "C" __global__ void finishSpreadCharge(long long* __restrict__ originalPmeGrid) {
real2* floatGrid = (real2*) pmeGrid; real* floatGrid = (real*) originalPmeGrid;
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
real scale = EPSILON_FACTOR/(real) 0xFFFFFFFF; real scale = EPSILON_FACTOR/(real) 0xFFFFFFFF;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
#ifdef USE_DOUBLE_PRECISION #ifdef USE_DOUBLE_PRECISION
long long value = pmeGrid[2*index]; for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x)
floatGrid[index] = scale*originalPmeGrid[index];
#else #else
long long value = pmeGrid[index]; for (int index = 2*(blockIdx.x*blockDim.x+threadIdx.x); index < gridSize; index += 2*blockDim.x*gridDim.x) {
#endif floatGrid[index] = scale*originalPmeGrid[index/2];
real2 floatValue = make_real2((real) (value*scale), 0); if (index+1 < gridSize)
floatGrid[index] = floatValue; floatGrid[index+1] = scale*originalPmeGrid[(index+gridSize+1)/2];
} }
#endif
} }
extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, real* __restrict__ energyBuffer, const real* __restrict__ pmeBsplineModuliX, // convolutes on the halfcomplex_pmeGrid, which is of size NX*NY*(NZ/2+1) as F(Q) is conjugate symmetric
const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ, real4 periodicBoxSize, real4 invPeriodicBoxSize) { extern "C" __global__ void
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict__ energyBuffer,
const real* __restrict__ pmeBsplineModuliX,
const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ,
real4 periodicBoxSize, real4 invPeriodicBoxSize) {
//R2C stores into a half complex matrix where the last dimension is cut by half
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z); const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
real energy = 0;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) { for( int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x ) {
int kx = index/(GRID_SIZE_Y*GRID_SIZE_Z);
int remainder = index-kx*GRID_SIZE_Y*GRID_SIZE_Z; // real indices
int ky = remainder/GRID_SIZE_Z; int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
int kz = remainder-ky*GRID_SIZE_Z; int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
if (kx == 0 && ky == 0 && kz == 0) int ky = remainder/(GRID_SIZE_Z/2+1);
continue; int kz = remainder-ky*(GRID_SIZE_Z/2+1);
// reciprocal space indices
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X); int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y); int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z); int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
// find the coordinates of the reciprocal space vectors
real mhx = mx*invPeriodicBoxSize.x; real mhx = mx*invPeriodicBoxSize.x;
real mhy = my*invPeriodicBoxSize.y; real mhy = my*invPeriodicBoxSize.y;
real mhz = mz*invPeriodicBoxSize.z; real mhz = mz*invPeriodicBoxSize.z;
real bx = pmeBsplineModuliX[kx]; real bx = pmeBsplineModuliX[kx];
real by = pmeBsplineModuliY[ky]; real by = pmeBsplineModuliY[ky];
real bz = pmeBsplineModuliZ[kz]; real bz = pmeBsplineModuliZ[kz];
real2 grid = pmeGrid[index];
real2 grid = halfcomplex_pmeGrid[index];
real m2 = mhx*mhx+mhy*mhy+mhz*mhz; real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
real denom = m2*bx*by*bz; real denom = m2*bx*by*bz;
real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom; real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm);
energy += eterm*(grid.x*grid.x + grid.y*grid.y); if (kx != 0 || ky != 0 || kz != 0) {
halfcomplex_pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm);
}
} }
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy;
} }
extern "C" __global__ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real2* __restrict__ pmeGrid, extern "C" __global__
void gridEvaluateEnergy(const real * __restrict__ originalGrid, const real * __restrict convolvedGrid, real * __restrict__ energyBuffer) {
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
real energy = 0;
for( int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x ) {
energy += originalGrid[index]*convolvedGrid[index];
}
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5*energy;
}
extern "C" __global__
void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real* __restrict__ originalPmeGrid,
real4 periodicBoxSize, real4 invPeriodicBoxSize) { real4 periodicBoxSize, real4 invPeriodicBoxSize) {
real3 data[PME_ORDER]; real3 data[PME_ORDER];
real3 ddata[PME_ORDER]; real3 ddata[PME_ORDER];
const real scale = RECIP(PME_ORDER-1); const real scale = RECIP(PME_ORDER-1);
for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) { for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
real3 force = make_real3(0); real3 force = make_real3(0);
real4 pos = posq[atom]; real4 pos = posq[atom];
...@@ -178,45 +213,52 @@ extern "C" __global__ void gridInterpolateForce(const real4* __restrict__ posq, ...@@ -178,45 +213,52 @@ extern "C" __global__ void gridInterpolateForce(const real4* __restrict__ posq,
// Since we need the full set of thetas, it's faster to compute them here than load them // Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory. // from global memory.
real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z); real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
data[PME_ORDER-1] = make_real3(0); data[PME_ORDER-1] = make_real3(0);
data[1] = dr; data[1] = dr;
data[0] = make_real3(1)-dr; data[0] = make_real3(1)-dr;
for (int j = 3; j < PME_ORDER; j++) { for (int j = 3; j < PME_ORDER; j++) {
real div = RECIP(j-1); real div = RECIP(j-1);
data[j-1] = div*dr*data[j-2]; data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++) for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]); data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
data[0] = div*(make_real3(1)-dr)*data[0]; data[0] = div*(make_real3(1)-dr)*data[0];
} }
ddata[0] = -data[0]; ddata[0] = -data[0];
for (int j = 1; j < PME_ORDER; j++) for (int j = 1; j < PME_ORDER; j++)
ddata[j] = data[j-1]-data[j]; ddata[j] = data[j-1]-data[j];
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2]; data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++) for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]); data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
data[0] = scale*(make_real3(1)-dr)*data[0]; data[0] = scale*(make_real3(1)-dr)*data[0];
// Compute the force on this atom. // Compute the force on this atom.
for (int ix = 0; ix < PME_ORDER; ix++) { for (int ix = 0; ix < PME_ORDER; ix++) {
int xbase = gridIndex.x+ix; int xbase = gridIndex.x+ix;
xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0); xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z; xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
real dx = data[ix].x; real dx = data[ix].x;
real ddx = ddata[ix].x; real ddx = ddata[ix].x;
for (int iy = 0; iy < PME_ORDER; iy++) { for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = gridIndex.y+iy; int ybase = gridIndex.y+iy;
ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0); ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
ybase = xbase + ybase*GRID_SIZE_Z; ybase = xbase + ybase*GRID_SIZE_Z;
real dy = data[iy].y; real dy = data[iy].y;
real ddy = ddata[iy].y; real ddy = ddata[iy].y;
for (int iz = 0; iz < PME_ORDER; iz++) { for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = gridIndex.z+iz; int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = ybase + zindex; int index = ybase + zindex;
real gridvalue = pmeGrid[index].x; real gridvalue = originalPmeGrid[index];
force.x += ddx*dy*data[iz].z*gridvalue; force.x += ddx*dy*data[iz].z*gridvalue;
force.y += dx*ddy*data[iz].z*gridvalue; force.y += dx*ddy*data[iz].z*gridvalue;
force.z += dx*dy*ddata[iz].z*gridvalue; force.z += dx*dy*ddata[iz].z*gridvalue;
...@@ -224,8 +266,9 @@ extern "C" __global__ void gridInterpolateForce(const real4* __restrict__ posq, ...@@ -224,8 +266,9 @@ extern "C" __global__ void gridInterpolateForce(const real4* __restrict__ posq,
} }
} }
real q = pos.w*EPSILON_FACTOR; real q = pos.w*EPSILON_FACTOR;
atomicAdd(&forceBuffers[atom], static_cast<unsigned long long>((long long) (-q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x*0xFFFFFFFF)));
atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y*0xFFFFFFFF))); forceBuffers[atom] += static_cast<unsigned long long>((long long) (-q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x*0xFFFFFFFF));
atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z*0xFFFFFFFF))); forceBuffers[atom+PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y*0xFFFFFFFF));
forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z*0xFFFFFFFF));
} }
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment