Commit ad62d81e authored by Peter Eastman
Browse files

OpenCL FFT stores only non-redundant elements

parent 2db04610
...@@ -69,6 +69,9 @@ public: ...@@ -69,6 +69,9 @@ public:
* arrays must be different. Also, the input array is used as workspace, so its contents * arrays must be different. Also, the input array is used as workspace, so its contents
* are destroyed. This also means that both arrays must be large enough to hold complex values, * are destroyed. This also means that both arrays must be large enough to hold complex values,
* even when performing a real-to-complex transform. * even when performing a real-to-complex transform.
* <p>
* When performing a real-to-complex transform, the output data is of size xsize*ysize*(zsize/2+1)
* and contains only the non-redundant elements.
* *
* @param in the data to transform, ordered such that in[x*ysize*zsize + y*zsize + z] contains element (x, y, z) * @param in the data to transform, ordered such that in[x*ysize*zsize + y*zsize + z] contains element (x, y, z)
* @param out on exit, this contains the transformed data * @param out on exit, this contains the transformed data
......
...@@ -639,6 +639,7 @@ private: ...@@ -639,6 +639,7 @@ private:
cl::Kernel pmeSpreadChargeKernel; cl::Kernel pmeSpreadChargeKernel;
cl::Kernel pmeFinishSpreadChargeKernel; cl::Kernel pmeFinishSpreadChargeKernel;
cl::Kernel pmeConvolutionKernel; cl::Kernel pmeConvolutionKernel;
cl::Kernel pmeEvalEnergyKernel;
cl::Kernel pmeInterpolateForceKernel; cl::Kernel pmeInterpolateForceKernel;
std::map<std::string, std::string> pmeDefines; std::map<std::string, std::string> pmeDefines;
std::vector<std::pair<int, int> > exceptionAtoms; std::vector<std::pair<int, int> > exceptionAtoms;
......
...@@ -310,6 +310,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa ...@@ -310,6 +310,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
// Create the kernel. // Create the kernel.
bool outputIsReal = (inputIsReal && axis == 2 && !forward); bool outputIsReal = (inputIsReal && axis == 2 && !forward);
bool outputIsPacked = (inputIsReal && axis == 2 && forward);
string outputSuffix = (outputIsReal ? ".x" : ""); string outputSuffix = (outputIsReal ? ".x" : "");
if (loopRequired) { if (loopRequired) {
source<<"for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))\n"; source<<"for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))\n";
...@@ -317,7 +318,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa ...@@ -317,7 +318,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
} }
else { else {
source<<"if (index < XSIZE*YSIZE)\n"; source<<"if (index < XSIZE*YSIZE)\n";
source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)]"<<outputSuffix<<";\n"; if (outputIsPacked)
source<<"out[y*(ZSIZE*(XSIZE/2+1))+(get_local_id(0)%ZSIZE)*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[get_local_id(0)]"<<outputSuffix<<";\n";
else
source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)]"<<outputSuffix<<";\n";
} }
map<string, string> replacements; map<string, string> replacements;
replacements["XSIZE"] = context.intToString(xsize); replacements["XSIZE"] = context.intToString(xsize);
...@@ -331,6 +335,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa ...@@ -331,6 +335,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
replacements["INPUT_TYPE"] = (inputIsReal && axis == 0 && forward ? "real" : "real2"); replacements["INPUT_TYPE"] = (inputIsReal && axis == 0 && forward ? "real" : "real2");
replacements["OUTPUT_TYPE"] = (outputIsReal ? "real" : "real2"); replacements["OUTPUT_TYPE"] = (outputIsReal ? "real" : "real2");
replacements["INPUT_IS_REAL"] = (inputIsReal && axis == 0 && forward ? "1" : "0"); replacements["INPUT_IS_REAL"] = (inputIsReal && axis == 0 && forward ? "1" : "0");
replacements["INPUT_IS_PACKED"] = (inputIsReal && axis == 0 && !forward ? "1" : "0");
replacements["OUTPUT_IS_PACKED"] = (outputIsPacked ? "1" : "0");
cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::fft, replacements)); cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::fft, replacements));
cl::Kernel kernel(program, "execFFT"); cl::Kernel kernel(program, "execFFT");
threads = (isCPU ? 1 : blocksPerGroup*zsize); threads = (isCPU ? 1 : blocksPerGroup*zsize);
......
...@@ -1806,6 +1806,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1806,6 +1806,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeZIndexKernel = cl::Kernel(program, "recordZIndex"); pmeZIndexKernel = cl::Kernel(program, "recordZIndex");
pmeSpreadChargeKernel = cl::Kernel(program, "gridSpreadCharge"); pmeSpreadChargeKernel = cl::Kernel(program, "gridSpreadCharge");
pmeConvolutionKernel = cl::Kernel(program, "reciprocalConvolution"); pmeConvolutionKernel = cl::Kernel(program, "reciprocalConvolution");
pmeEvalEnergyKernel = cl::Kernel(program, "gridEvaluateEnergy");
pmeInterpolateForceKernel = cl::Kernel(program, "gridInterpolateForce"); pmeInterpolateForceKernel = cl::Kernel(program, "gridInterpolateForce");
int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4)); int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
pmeUpdateBsplinesKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer()); pmeUpdateBsplinesKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
...@@ -1826,10 +1827,14 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1826,10 +1827,14 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid->getDeviceBuffer()); pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid->getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta->getDeviceBuffer()); pmeSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(1, pmeBsplineModuliX->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliY->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(3, pmeBsplineModuliY->getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(3, pmeBsplineModuliZ->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ->getDeviceBuffer()); pmeEvalEnergyKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer());
pmeEvalEnergyKernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer());
pmeEvalEnergyKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer());
pmeEvalEnergyKernel.setArg<cl::Buffer>(3, pmeBsplineModuliY->getDeviceBuffer());
pmeEvalEnergyKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ->getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer()); pmeInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer()); pmeInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid->getDeviceBuffer()); pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid->getDeviceBuffer());
...@@ -1861,7 +1866,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1861,7 +1866,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms()); cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms());
} }
if (pmeGrid != NULL && includeReciprocal) { if (pmeGrid != NULL && includeReciprocal) {
if (usePmeQueue) if (usePmeQueue && !includeEnergy)
cl.setQueue(pmeQueue); cl.setQueue(pmeQueue);
// Invert the periodic box vectors. // Invert the periodic box vectors.
...@@ -1936,19 +1941,24 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1936,19 +1941,24 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
} }
fft->execFFT(*pmeGrid, *pmeGrid2, true); fft->execFFT(*pmeGrid, *pmeGrid2, true);
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble(); mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
double scaleFactor = 1.0/(M_PI*boxSize.x*boxSize.y*boxSize.z);
if (cl.getUseDoublePrecision()) { if (cl.getUseDoublePrecision()) {
pmeConvolutionKernel.setArg<mm_double4>(5, recipBoxVectors[0]); pmeConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]);
pmeConvolutionKernel.setArg<mm_double4>(6, recipBoxVectors[1]); pmeConvolutionKernel.setArg<mm_double4>(5, recipBoxVectors[1]);
pmeConvolutionKernel.setArg<mm_double4>(7, recipBoxVectors[2]); pmeConvolutionKernel.setArg<mm_double4>(6, recipBoxVectors[2]);
pmeConvolutionKernel.setArg<cl_double>(8, scaleFactor); pmeEvalEnergyKernel.setArg<mm_double4>(5, recipBoxVectors[0]);
pmeEvalEnergyKernel.setArg<mm_double4>(6, recipBoxVectors[1]);
pmeEvalEnergyKernel.setArg<mm_double4>(7, recipBoxVectors[2]);
} }
else { else {
pmeConvolutionKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[0]); pmeConvolutionKernel.setArg<mm_float4>(4, recipBoxVectorsFloat[0]);
pmeConvolutionKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[1]); pmeConvolutionKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[1]);
pmeConvolutionKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[2]); pmeConvolutionKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[2]);
pmeConvolutionKernel.setArg<cl_float>(8, (float) scaleFactor); pmeEvalEnergyKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[0]);
} pmeEvalEnergyKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[1]);
pmeEvalEnergyKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[2]);
}
if (includeEnergy)
cl.executeKernel(pmeEvalEnergyKernel, cl.getNumAtoms());
cl.executeKernel(pmeConvolutionKernel, cl.getNumAtoms()); cl.executeKernel(pmeConvolutionKernel, cl.getNumAtoms());
fft->execFFT(*pmeGrid2, *pmeGrid, false); fft->execFFT(*pmeGrid2, *pmeGrid, false);
setPeriodicBoxSizeArg(cl, pmeInterpolateForceKernel, 3); setPeriodicBoxSizeArg(cl, pmeInterpolateForceKernel, 3);
......
...@@ -2,6 +2,19 @@ real2 multiplyComplex(real2 c1, real2 c2) { ...@@ -2,6 +2,19 @@ real2 multiplyComplex(real2 c1, real2 c2) {
return (real2) (c1.x*c2.x-c1.y*c2.y, c1.x*c2.y+c1.y*c2.x); return (real2) (c1.x*c2.x-c1.y*c2.y, c1.x*c2.y+c1.y*c2.x);
} }
/**
* Load element (x, y, z) of the full complex grid from the half-complex grid produced by a
* real-to-complex transform.
* <p>
* Only ZSIZE/2+1 elements are stored along z. An element whose z index lies inside that range
* is read directly. Any other element is taken as the complex conjugate of the stored element
* at the mirrored position (XSIZE-x, YSIZE-y, ZSIZE-z), where an x or y index of 0 mirrors to 0.
*
* @param in the half-complex grid, indexed as in[x*YSIZE*(ZSIZE/2+1) + y*(ZSIZE/2+1) + z]
* @param x  the x index of the requested element
* @param y  the y index of the requested element
* @param z  the z index of the requested element, in full-grid coordinates
* @return the requested complex value (real part in .x, imaginary part in .y)
*/
real2 loadComplexValue(__global const real2* restrict in, int x, int y, int z) {
// Number of z elements actually stored for each (x, y) pair.
const int inputZSize = ZSIZE/2+1;
// Stored element: read it directly.
if (z < inputZSize)
return in[x*YSIZE*inputZSize+y*inputZSize+z];
// Element is not stored: find the mirrored indices, keeping index 0 mapped to itself.
int xp = (x == 0 ? 0 : XSIZE-x);
int yp = (y == 0 ? 0 : YSIZE-y);
real2 value = in[xp*YSIZE*inputZSize+yp*inputZSize+(ZSIZE-z)];
// Negate the imaginary part to form the complex conjugate of the mirrored element.
return (real2) (value.x, -value.y);
}
/** /**
* Perform a 1D FFT on each row along one axis. * Perform a 1D FFT on each row along one axis.
*/ */
...@@ -16,10 +29,16 @@ __kernel void execFFT(__global const INPUT_TYPE* restrict in, __global OUTPUT_TY ...@@ -16,10 +29,16 @@ __kernel void execFFT(__global const INPUT_TYPE* restrict in, __global OUTPUT_TY
int index = baseIndex+get_local_id(0)/ZSIZE; int index = baseIndex+get_local_id(0)/ZSIZE;
int x = index/YSIZE; int x = index/YSIZE;
int y = index-x*YSIZE; int y = index-x*YSIZE;
#if OUTPUT_IS_PACKED
if (x >= XSIZE/2+1)
continue;
#endif
#if LOOP_REQUIRED #if LOOP_REQUIRED
for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0)) for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))
#if INPUT_IS_REAL #if INPUT_IS_REAL
data0[z] = (real2) (in[x*(YSIZE*ZSIZE)+y*ZSIZE+z], 0); data0[z] = (real2) (in[x*(YSIZE*ZSIZE)+y*ZSIZE+z], 0);
#elif INPUT_IS_PACKED
data0[z] = loadComplexValue(in, x, y, z);
#else #else
data0[z] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+z]; data0[z] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+z];
#endif #endif
...@@ -27,6 +46,8 @@ __kernel void execFFT(__global const INPUT_TYPE* restrict in, __global OUTPUT_TY ...@@ -27,6 +46,8 @@ __kernel void execFFT(__global const INPUT_TYPE* restrict in, __global OUTPUT_TY
if (index < XSIZE*YSIZE) if (index < XSIZE*YSIZE)
#if INPUT_IS_REAL #if INPUT_IS_REAL
data0[get_local_id(0)] = (real2) (in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)%ZSIZE], 0); data0[get_local_id(0)] = (real2) (in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)%ZSIZE], 0);
#elif INPUT_IS_PACKED
data0[get_local_id(0)] = loadComplexValue(in, x, y, get_local_id(0)%ZSIZE);
#else #else
data0[get_local_id(0)] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)%ZSIZE]; data0[get_local_id(0)] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)%ZSIZE];
#endif #endif
......
...@@ -40,6 +40,7 @@ __kernel void unpackForwardData(__global const real2* restrict in, __global real ...@@ -40,6 +40,7 @@ __kernel void unpackForwardData(__global const real2* restrict in, __global real
// Transform the data. // Transform the data.
const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE; const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
const int outputZSize = ZSIZE/2+1;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) { for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
int x = index/(PACKED_YSIZE*PACKED_ZSIZE); int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE); int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
...@@ -58,25 +59,41 @@ __kernel void unpackForwardData(__global const real2* restrict in, __global real ...@@ -58,25 +59,41 @@ __kernel void unpackForwardData(__global const real2* restrict in, __global real
real2 wfac = w[z]; real2 wfac = w[z];
#endif #endif
real2 output = (real2) ((z1.x+z2.x - wfac.x*(z1.x-z2.x) + wfac.y*(z1.y+z2.y))/2, (z1.y-z2.y - wfac.y*(z1.x-z2.x) - wfac.x*(z1.y+z2.y))/2); real2 output = (real2) ((z1.x+z2.x - wfac.x*(z1.x-z2.x) + wfac.y*(z1.y+z2.y))/2, (z1.y-z2.y - wfac.y*(z1.x-z2.x) - wfac.x*(z1.y+z2.y))/2);
out[x*YSIZE*ZSIZE+y*ZSIZE+z] = output; if (z < outputZSize)
out[x*YSIZE*outputZSize+y*outputZSize+z] = output;
xp = (x == 0 ? 0 : XSIZE-x); xp = (x == 0 ? 0 : XSIZE-x);
yp = (y == 0 ? 0 : YSIZE-y); yp = (y == 0 ? 0 : YSIZE-y);
zp = (z == 0 ? 0 : ZSIZE-z); zp = (z == 0 ? 0 : ZSIZE-z);
if (zp < outputZSize) {
#if PACKED_AXIS == 0 #if PACKED_AXIS == 0
if (x == 0) if (x == 0)
out[PACKED_XSIZE*YSIZE*ZSIZE+yp*ZSIZE+zp] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2); out[PACKED_XSIZE*YSIZE*outputZSize+yp*outputZSize+zp] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
#elif PACKED_AXIS == 1 #elif PACKED_AXIS == 1
if (y == 0) if (y == 0)
out[xp*YSIZE*ZSIZE+PACKED_YSIZE*ZSIZE+zp] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2); out[xp*YSIZE*outputZSize+PACKED_YSIZE*outputZSize+zp] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
#else #else
if (z == 0) if (z == 0)
out[xp*YSIZE*ZSIZE+yp*ZSIZE+PACKED_ZSIZE] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2); out[xp*YSIZE*outputZSize+yp*outputZSize+PACKED_ZSIZE] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
#endif #endif
else else
out[xp*YSIZE*ZSIZE+yp*ZSIZE+zp] = (real2) (output.x, -output.y); out[xp*YSIZE*outputZSize+yp*outputZSize+zp] = (real2) (output.x, -output.y);
}
} }
} }
/**
* Load element (x, y, z) of the full complex grid from the half-complex grid produced by a
* real-to-complex transform.
* <p>
* Only ZSIZE/2+1 elements are stored along z. An element whose z index lies inside that range
* is read directly. Any other element is read from the mirrored position
* (XSIZE-x, YSIZE-y, ZSIZE-z), where an x or y index of 0 mirrors to 0, and is returned with
* its .y component negated (the complex conjugate, assuming .x/.y hold the real/imaginary parts).
*
* @param in the half-complex grid, indexed as in[x*YSIZE*(ZSIZE/2+1) + y*(ZSIZE/2+1) + z]
* @param x  the x index of the requested element
* @param y  the y index of the requested element
* @param z  the z index of the requested element, in full-grid coordinates
* @return the value of the requested element
*/
real2 loadComplexValue(__global const real2* restrict in, int x, int y, int z) {
// Number of z elements actually stored for each (x, y) pair.
const int inputZSize = ZSIZE/2+1;
// Stored element: read it directly.
if (z < inputZSize)
return in[x*YSIZE*inputZSize+y*inputZSize+z];
// Element is not stored: find the mirrored indices, keeping index 0 mapped to itself.
int xp = (x == 0 ? 0 : XSIZE-x);
int yp = (y == 0 ? 0 : YSIZE-y);
real2 value = in[xp*YSIZE*inputZSize+yp*inputZSize+(ZSIZE-z)];
// Return the mirrored element with its .y component negated.
return (real2) (value.x, -value.y);
}
/** /**
* Repack the symmetric complex grid into one half as large in preparation for doing an inverse complex-to-real transform. * Repack the symmetric complex grid into one half as large in preparation for doing an inverse complex-to-real transform.
*/ */
...@@ -106,16 +123,16 @@ __kernel void packBackwardData(__global const real2* restrict in, __global real2 ...@@ -106,16 +123,16 @@ __kernel void packBackwardData(__global const real2* restrict in, __global real2
int xp = (x == 0 ? 0 : PACKED_XSIZE-x); int xp = (x == 0 ? 0 : PACKED_XSIZE-x);
int yp = (y == 0 ? 0 : PACKED_YSIZE-y); int yp = (y == 0 ? 0 : PACKED_YSIZE-y);
int zp = (z == 0 ? 0 : PACKED_ZSIZE-z); int zp = (z == 0 ? 0 : PACKED_ZSIZE-z);
real2 z1 = in[x*YSIZE*ZSIZE+y*ZSIZE+z]; real2 z1 = loadComplexValue(in, x, y, z);
#if PACKED_AXIS == 0 #if PACKED_AXIS == 0
real2 wfac = w[x]; real2 wfac = w[x];
real2 z2 = in[(PACKED_XSIZE-x)*YSIZE*ZSIZE+yp*ZSIZE+zp]; real2 z2 = loadComplexValue(in, PACKED_XSIZE-x, yp, zp);
#elif PACKED_AXIS == 1 #elif PACKED_AXIS == 1
real2 wfac = w[y]; real2 wfac = w[y];
real2 z2 = in[xp*YSIZE*ZSIZE+(PACKED_YSIZE-y)*ZSIZE+zp]; real2 z2 = loadComplexValue(in, xp, PACKED_YSIZE-y, zp);
#else #else
real2 wfac = w[z]; real2 wfac = w[z];
real2 z2 = in[xp*YSIZE*ZSIZE+yp*ZSIZE+(PACKED_ZSIZE-z)]; real2 z2 = loadComplexValue(in, xp, yp, PACKED_ZSIZE-z);
#endif #endif
real2 even = (real2) ((z1.x+z2.x)/2, (z1.y-z2.y)/2); real2 even = (real2) ((z1.x+z2.x)/2, (z1.y-z2.y)/2);
real2 odd = (real2) ((z1.x-z2.x)/2, (z1.y+z2.y)/2); real2 odd = (real2) ((z1.x-z2.x)/2, (z1.y+z2.y)/2);
......
...@@ -294,17 +294,18 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con ...@@ -294,17 +294,18 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
} }
#endif #endif
__kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global real* restrict energyBuffer, __global const real* restrict pmeBsplineModuliX, __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global const real* restrict pmeBsplineModuliX,
__global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, real recipScaleFactor) { __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; // R2C stores into a half complex matrix where the last dimension is cut by half
real energy = 0.0f; const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) { for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
int kx = index/(GRID_SIZE_Y*GRID_SIZE_Z); // real indices
int remainder = index-kx*GRID_SIZE_Y*GRID_SIZE_Z; int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
int ky = remainder/GRID_SIZE_Z; int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
int kz = remainder-ky*GRID_SIZE_Z; int ky = remainder/(GRID_SIZE_Z/2+1);
if (kx == 0 && ky == 0 && kz == 0) int kz = remainder-ky*(GRID_SIZE_Z/2+1);
continue;
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X); int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y); int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z); int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
...@@ -318,8 +319,48 @@ __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global r ...@@ -318,8 +319,48 @@ __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global r
real m2 = mhx*mhx+mhy*mhy+mhz*mhz; real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
real denom = m2*bx*by*bz; real denom = m2*bx*by*bz;
real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom; real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm); if (kx != 0 || ky != 0 || kz != 0) {
energy += eterm*(grid.x*grid.x + grid.y*grid.y); pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
}
}
}
/**
* Accumulate each work-item's share of the reciprocal space energy from the transformed grid.
* <p>
* Each work-item starts at index get_global_id(0) and advances by get_global_size(0) over all
* GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z points of the full grid. For every point except the
* origin it adds eterm*(grid.x^2 + grid.y^2) to a private sum, reading the grid value from the
* half-complex storage (GRID_SIZE_Z/2+1 elements along z). Half of that sum is then added to
* this work-item's slot of energyBuffer.
*
* @param pmeGrid           the half-complex grid, indexed as kz + ky*(GRID_SIZE_Z/2+1) + kx*GRID_SIZE_Y*(GRID_SIZE_Z/2+1)
* @param energyBuffer      per-work-item energy accumulators, indexed by get_global_id(0)
* @param pmeBsplineModuliX values indexed by kx that enter the denominator of eterm
* @param pmeBsplineModuliY values indexed by ky that enter the denominator of eterm
* @param pmeBsplineModuliZ values indexed by kz that enter the denominator of eterm
* @param recipBoxVecX      first reciprocal box vector (only .x is read)
* @param recipBoxVecY      second reciprocal box vector (.x and .y are read)
* @param recipBoxVecZ      third reciprocal box vector (.x, .y and .z are read)
*/
__kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global real* restrict energyBuffer,
__global const real* restrict pmeBsplineModuliX, __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ,
real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
// The loop covers every point of the FULL grid, even though pmeGrid itself is stored as a
// half-complex matrix whose last dimension holds only GRID_SIZE_Z/2+1 elements.
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
real energy = 0;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
// Split the flat index into (kx, ky, kz) coordinates of the full grid.
int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z));
int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z);
int ky = remainder/(GRID_SIZE_Z);
int kz = remainder-ky*(GRID_SIZE_Z);
// Indices at or beyond (GRID_SIZE+1)/2 are wrapped around to negative values.
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
real mhx = mx*recipBoxVecX.x;
real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
// The moduli are looked up with the full-grid indices, before any remapping below.
real bx = pmeBsplineModuliX[kx];
real by = pmeBsplineModuliY[ky];
real bz = pmeBsplineModuliZ[kz];
// At the origin m2 is zero, so denom is zero there; eterm is never used for that point.
real denom = m2*bx*by*bz;
real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
// Points outside the stored half are read from the mirrored position (index 0 mirrors to 0).
if (kz >= (GRID_SIZE_Z/2+1)) {
kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky);
kz = GRID_SIZE_Z-kz;
}
int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
real2 grid = pmeGrid[indexInHalfComplexGrid];
// Skip the origin. A remapped point always has kz != 0, so this test only excludes (0, 0, 0).
if (kx != 0 || ky != 0 || kz != 0) {
energy += eterm*(grid.x*grid.x + grid.y*grid.y);
}
} }
energyBuffer[get_global_id(0)] += 0.5f*energy; energyBuffer[get_global_id(0)] += 0.5f*energy;
} }
......
...@@ -85,10 +85,15 @@ void testTransform(bool realToComplex, int xsize, int ysize, int zsize) { ...@@ -85,10 +85,15 @@ void testTransform(bool realToComplex, int xsize, int ysize, int zsize) {
fftpack_t plan; fftpack_t plan;
fftpack_init_3d(&plan, xsize, ysize, zsize); fftpack_init_3d(&plan, xsize, ysize, zsize);
fftpack_exec_3d(plan, FFTPACK_FORWARD, &reference[0], &reference[0]); fftpack_exec_3d(plan, FFTPACK_FORWARD, &reference[0], &reference[0]);
for (int i = 0; i < (int) result.size(); ++i) { int outputZSize = (realToComplex ? zsize/2+1 : zsize);
ASSERT_EQUAL_TOL(reference[i].re, result[i].x, 1e-3); for (int x = 0; x < xsize; x++)
ASSERT_EQUAL_TOL(reference[i].im, result[i].y, 1e-3); for (int y = 0; y < ysize; y++)
} for (int z = 0; z < outputZSize; z++) {
int index1 = x*ysize*zsize + y*zsize + z;
int index2 = x*ysize*outputZSize + y*outputZSize + z;
ASSERT_EQUAL_TOL(reference[index1].re, result[index2].x, 1e-3);
ASSERT_EQUAL_TOL(reference[index1].im, result[index2].y, 1e-3);
}
fftpack_destroy(plan); fftpack_destroy(plan);
// Perform a backward transform and see if we get the original values. // Perform a backward transform and see if we get the original values.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment