Commit 82069311 authored by Peter Eastman's avatar Peter Eastman
Browse files

Cleaned up CUDA FFT code

parent d3e91b15
...@@ -665,7 +665,7 @@ private: ...@@ -665,7 +665,7 @@ private:
std::vector<std::pair<int, int> > exceptionAtoms; std::vector<std::pair<int, int> > exceptionAtoms;
double ewaldSelfEnergy, dispersionCoefficient, alpha; double ewaldSelfEnergy, dispersionCoefficient, alpha;
int interpolateForceThreads; int interpolateForceThreads;
bool hasCoulomb, hasLJ, usePmeStream; bool hasCoulomb, hasLJ, usePmeStream, useCudaFFT;
static const int PmeOrder = 5; static const int PmeOrder = 5;
}; };
......
...@@ -168,14 +168,12 @@ static int getSmallestRadix(int size) { ...@@ -168,14 +168,12 @@ static int getSmallestRadix(int size) {
} }
CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads, int axis, bool forward, bool inputIsReal) { CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads, int axis, bool forward, bool inputIsReal) {
int maxThreads = 256;//std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>()); int maxThreads = 256;
// while (maxThreads > 128 && maxThreads-64 >= zsize) // while (maxThreads > 128 && maxThreads-64 >= zsize)
// maxThreads -= 64; // maxThreads -= 64;
int threadsPerBlock = zsize/getSmallestRadix(zsize); int threadsPerBlock = zsize/getSmallestRadix(zsize);
bool isCPU = false;//context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU;
bool loopRequired = (threadsPerBlock > maxThreads || isCPU);
stringstream source; stringstream source;
int blocksPerGroup = (loopRequired ? 1 : max(1, maxThreads/threadsPerBlock)); int blocksPerGroup = max(1, maxThreads/threadsPerBlock);
int stage = 0; int stage = 0;
int L = zsize; int L = zsize;
int m = 1; int m = 1;
...@@ -201,19 +199,13 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads ...@@ -201,19 +199,13 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
source<<"{\n"; source<<"{\n";
L = L/radix; L = L/radix;
source<<"// Pass "<<(stage+1)<<" (radix "<<radix<<")\n"; source<<"// Pass "<<(stage+1)<<" (radix "<<radix<<")\n";
if (loopRequired) { if (L*m < threadsPerBlock)
source<<"for (int i = threadIdx.x; i < "<<(L*m)<<"; i += blockDim.x) {\n"; source<<"if (threadIdx.x < "<<(blocksPerGroup*L*m)<<") {\n";
source<<"int base = i;\n"; else
} source<<"{\n";
else { source<<"int block = threadIdx.x/"<<(L*m)<<";\n";
if (L*m < threadsPerBlock) source<<"int i = threadIdx.x-block*"<<(L*m)<<";\n";
source<<"if (threadIdx.x < "<<(blocksPerGroup*L*m)<<") {\n"; source<<"int base = i+block*"<<zsize<<";\n";
else
source<<"{\n";
source<<"int block = threadIdx.x/"<<(L*m)<<";\n";
source<<"int i = threadIdx.x-block*"<<(L*m)<<";\n";
source<<"int base = i+block*"<<zsize<<";\n";
}
source<<"int j = i/"<<m<<";\n"; source<<"int j = i/"<<m<<";\n";
if (radix == 7) { if (radix == 7) {
source<<"real2 c0 = data"<<input<<"[base];\n"; source<<"real2 c0 = data"<<input<<"[base];\n";
...@@ -328,27 +320,15 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads ...@@ -328,27 +320,15 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
bool outputIsReal = (inputIsReal && axis == 2 && !forward); bool outputIsReal = (inputIsReal && axis == 2 && !forward);
bool outputIsPacked = (inputIsReal && axis == 2 && forward); bool outputIsPacked = (inputIsReal && axis == 2 && forward);
string outputSuffix = (outputIsReal ? ".x" : ""); string outputSuffix = (outputIsReal ? ".x" : "");
if (loopRequired || true) { if (outputIsPacked)
if (outputIsPacked) source<<"if (index < XSIZE*YSIZE && x < XSIZE/2+1)\n";
source<<"if (index < XSIZE*YSIZE && x < XSIZE/2+1)\n"; else
else source<<"if (index < XSIZE*YSIZE)\n";
source<<"if (index < XSIZE*YSIZE)\n"; source<<"for (int i = threadIdx.x-block*THREADS_PER_BLOCK; i < ZSIZE; i += THREADS_PER_BLOCK)\n";
source<<"for (int i = threadIdx.x-block*THREADS_PER_BLOCK; i < ZSIZE; i += THREADS_PER_BLOCK)\n"; if (outputIsPacked)
if (outputIsPacked) source<<"out[y*(ZSIZE*(XSIZE/2+1))+i*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[i+block*ZSIZE]"<<outputSuffix<<";\n";
source<<"out[y*(ZSIZE*(XSIZE/2+1))+i*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[i+block*ZSIZE]"<<outputSuffix<<";\n"; else
else
source<<"out[y*(ZSIZE*XSIZE)+i*XSIZE+x] = data"<<(stage%2)<<"[i+block*ZSIZE]"<<outputSuffix<<";\n"; source<<"out[y*(ZSIZE*XSIZE)+i*XSIZE+x] = data"<<(stage%2)<<"[i+block*ZSIZE]"<<outputSuffix<<";\n";
}
else {
if (outputIsPacked) {
source<<"if (index < XSIZE*YSIZE && x < XSIZE/2+1)\n";
source<<"out[y*(ZSIZE*(XSIZE/2+1))+(threadIdx.x%ZSIZE)*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[threadIdx.x]"<<outputSuffix<<";\n";
}
else {
source<<"if (index < XSIZE*YSIZE)\n";
source<<"out[y*(ZSIZE*XSIZE)+(threadIdx.x%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[threadIdx.x]"<<outputSuffix<<";\n";
}
}
map<string, string> replacements; map<string, string> replacements;
replacements["XSIZE"] = context.intToString(xsize); replacements["XSIZE"] = context.intToString(xsize);
replacements["YSIZE"] = context.intToString(ysize); replacements["YSIZE"] = context.intToString(ysize);
...@@ -357,7 +337,6 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads ...@@ -357,7 +337,6 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
replacements["THREADS_PER_BLOCK"] = context.intToString(threadsPerBlock); replacements["THREADS_PER_BLOCK"] = context.intToString(threadsPerBlock);
replacements["M_PI"] = context.doubleToString(M_PI); replacements["M_PI"] = context.doubleToString(M_PI);
replacements["COMPUTE_FFT"] = source.str(); replacements["COMPUTE_FFT"] = source.str();
replacements["LOOP_REQUIRED"] = (loopRequired ? "1" : "0");
replacements["SIGN"] = (forward ? "1" : "-1"); replacements["SIGN"] = (forward ? "1" : "-1");
replacements["INPUT_TYPE"] = (inputIsReal && axis == 0 && forward ? "real" : "real2"); replacements["INPUT_TYPE"] = (inputIsReal && axis == 0 && forward ? "real" : "real2");
replacements["OUTPUT_TYPE"] = (outputIsReal ? "real" : "real2"); replacements["OUTPUT_TYPE"] = (outputIsReal ? "real" : "real2");
...@@ -366,6 +345,6 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads ...@@ -366,6 +345,6 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
replacements["OUTPUT_IS_PACKED"] = (outputIsPacked ? "1" : "0"); replacements["OUTPUT_IS_PACKED"] = (outputIsPacked ? "1" : "0");
CUmodule module = context.createModule(CudaKernelSources::vectorOps+context.replaceStrings(CudaKernelSources::fft, replacements)); CUmodule module = context.createModule(CudaKernelSources::vectorOps+context.replaceStrings(CudaKernelSources::fft, replacements));
CUfunction kernel = context.getKernel(module, "execFFT"); CUfunction kernel = context.getKernel(module, "execFFT");
threads = (isCPU ? 1 : blocksPerGroup*threadsPerBlock); threads = blocksPerGroup*threadsPerBlock;
return kernel; return kernel;
} }
...@@ -1502,8 +1502,10 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() { ...@@ -1502,8 +1502,10 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
if (pmeio != NULL) if (pmeio != NULL)
delete pmeio; delete pmeio;
if (hasInitializedFFT) { if (hasInitializedFFT) {
cufftDestroy(fftForward); if (useCudaFFT) {
cufftDestroy(fftBackward); cufftDestroy(fftForward);
cufftDestroy(fftBackward);
}
if (usePmeStream) { if (usePmeStream) {
cuStreamDestroy(pmeStream); cuStreamDestroy(pmeStream);
cuEventDestroy(pmeSyncEvent); cuEventDestroy(pmeSyncEvent);
...@@ -1694,39 +1696,40 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1694,39 +1696,40 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
// Create required data structures. // Create required data structures.
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
directPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, cu.getComputeCapability() >= 2.0 ? 2*elementSize : 2*sizeof(long long), "originalPmeGrid");
directPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, cu.getComputeCapability() >= 2.0 ? elementSize : sizeof(long long), "originalPmeGrid"); reciprocalPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "reciprocalPmeGrid");
reciprocalPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*(gridSizeZ/2+1), 2*elementSize, "reciprocalPmeGrid");
cu.addAutoclearBuffer(*directPmeGrid); cu.addAutoclearBuffer(*directPmeGrid);
pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX"); pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY"); pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ"); pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange"); pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex"); pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms()); sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
fft = new CudaFFT3D(cu, gridSizeX, gridSizeY, gridSizeZ, true); useCudaFFT = false; // We might switch back in the future, once Nvidia has all their bugs worked out
if (useCudaFFT) {
cufftResult result = cufftPlan3d(&fftForward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C); cufftResult result = cufftPlan3d(&fftForward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C);
if (result != CUFFT_SUCCESS) if (result != CUFFT_SUCCESS)
throw OpenMMException("Error initializing FFT: "+cu.intToString(result)); throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
result = cufftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R); result = cufftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
if (result != CUFFT_SUCCESS) if (result != CUFFT_SUCCESS)
throw OpenMMException("Error initializing FFT: "+cu.intToString(result)); throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
cufftSetCompatibilityMode(fftForward, CUFFT_COMPATIBILITY_NATIVE);
cufftSetCompatibilityMode(fftForward, CUFFT_COMPATIBILITY_NATIVE); cufftSetCompatibilityMode(fftBackward, CUFFT_COMPATIBILITY_NATIVE);
cufftSetCompatibilityMode(fftBackward, CUFFT_COMPATIBILITY_NATIVE); }
else
fft = new CudaFFT3D(cu, gridSizeX, gridSizeY, gridSizeZ, true);
// Prepare for doing PME on its own stream. // Prepare for doing PME on its own stream.
int cufftVersion; char deviceName[100];
cufftGetVersion(&cufftVersion); cuDeviceGetName(deviceName, 100, cu.getDevice());
usePmeStream = true;//(cu.getComputeCapability() < 5.0 && numParticles < 130000 && cufftVersion >= 6000 && cufftVersion != 7000); // Workarounds for various CUDA bugs usePmeStream = (string(deviceName) != "GeForce GTX 980"); // Using a separate stream is slower on GTX 980
if (usePmeStream) { if (usePmeStream) {
cuStreamCreate(&pmeStream, CU_STREAM_NON_BLOCKING); cuStreamCreate(&pmeStream, CU_STREAM_NON_BLOCKING);
cufftSetStream(fftForward, pmeStream); if (useCudaFFT) {
cufftSetStream(fftBackward, pmeStream); cufftSetStream(fftForward, pmeStream);
cufftSetStream(fftBackward, pmeStream);
}
CHECK_RESULT(cuEventCreate(&pmeSyncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for NonbondedForce"); CHECK_RESULT(cuEventCreate(&pmeSyncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for NonbondedForce");
int recipForceGroup = force.getReciprocalSpaceForceGroup(); int recipForceGroup = force.getReciprocalSpaceForceGroup();
if (recipForceGroup < 0) if (recipForceGroup < 0)
...@@ -1896,11 +1899,15 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1896,11 +1899,15 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, directPmeGrid->getSize()); cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, directPmeGrid->getSize());
} }
// if (cu.getUseDoublePrecision()) if (useCudaFFT) {
// cufftExecD2Z(fftForward, (double*) directPmeGrid->getDevicePointer(), (double2*) reciprocalPmeGrid->getDevicePointer()); if (cu.getUseDoublePrecision())
// else cufftExecD2Z(fftForward, (double*) directPmeGrid->getDevicePointer(), (double2*) reciprocalPmeGrid->getDevicePointer());
// cufftExecR2C(fftForward, (float*) directPmeGrid->getDevicePointer(), (float2*) reciprocalPmeGrid->getDevicePointer()); else
fft->execFFT(*directPmeGrid, *reciprocalPmeGrid, true); cufftExecR2C(fftForward, (float*) directPmeGrid->getDevicePointer(), (float2*) reciprocalPmeGrid->getDevicePointer());
}
else {
fft->execFFT(*directPmeGrid, *reciprocalPmeGrid, true);
}
if (includeEnergy) { if (includeEnergy) {
void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
...@@ -1914,12 +1921,15 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1914,12 +1921,15 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeConvolutionKernel, convolutionArgs, cu.getNumAtoms()); cu.executeKernel(pmeConvolutionKernel, convolutionArgs, cu.getNumAtoms());
// if (cu.getUseDoublePrecision()) if (useCudaFFT) {
// cufftExecZ2D(fftBackward, (double2*) reciprocalPmeGrid->getDevicePointer(), (double*) directPmeGrid->getDevicePointer()); if (cu.getUseDoublePrecision())
// else cufftExecZ2D(fftBackward, (double2*) reciprocalPmeGrid->getDevicePointer(), (double*) directPmeGrid->getDevicePointer());
// cufftExecC2R(fftBackward, (float2*) reciprocalPmeGrid->getDevicePointer(), (float*) directPmeGrid->getDevicePointer()); else
fft->execFFT(*reciprocalPmeGrid, *directPmeGrid, false); cufftExecC2R(fftBackward, (float2*) reciprocalPmeGrid->getDevicePointer(), (float*) directPmeGrid->getDevicePointer());
}
else {
fft->execFFT(*reciprocalPmeGrid, *directPmeGrid, false);
}
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid->getDevicePointer(), void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer()}; cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer()};
......
...@@ -35,26 +35,15 @@ extern "C" __global__ void execFFT(const INPUT_TYPE* __restrict__ in, OUTPUT_TYP ...@@ -35,26 +35,15 @@ extern "C" __global__ void execFFT(const INPUT_TYPE* __restrict__ in, OUTPUT_TYP
#if OUTPUT_IS_PACKED #if OUTPUT_IS_PACKED
if (x < XSIZE/2+1) { if (x < XSIZE/2+1) {
#endif #endif
//#if LOOP_REQUIRED
if (index < XSIZE*YSIZE) if (index < XSIZE*YSIZE)
for (int i = threadIdx.x-block*THREADS_PER_BLOCK; i < ZSIZE; i += THREADS_PER_BLOCK) for (int i = threadIdx.x-block*THREADS_PER_BLOCK; i < ZSIZE; i += THREADS_PER_BLOCK)
#if INPUT_IS_REAL #if INPUT_IS_REAL
data0[i+block*ZSIZE] = make_real2(in[x*(YSIZE*ZSIZE)+y*ZSIZE+i], 0); data0[i+block*ZSIZE] = make_real2(in[x*(YSIZE*ZSIZE)+y*ZSIZE+i], 0);
#elif INPUT_IS_PACKED #elif INPUT_IS_PACKED
data0[i+block*ZSIZE] = loadComplexValue(in, x, y, i); data0[i+block*ZSIZE] = loadComplexValue(in, x, y, i);
#else #else
data0[i+block*ZSIZE] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+i]; data0[i+block*ZSIZE] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+i];
#endif #endif
//#else
// if (index < XSIZE*YSIZE && (threadIdx.x%BLOCK_SIZE) < ZSIZE)
// #if INPUT_IS_REAL
// data0[threadIdx.x] = make_real2(in[x*(YSIZE*ZSIZE)+y*ZSIZE+threadIdx.x%BLOCK_SIZE], 0);
// #elif INPUT_IS_PACKED
// data0[threadIdx.x] = loadComplexValue(in, x, y, threadIdx.x%BLOCK_SIZE);
// #else
// data0[threadIdx.x] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+threadIdx.x%BLOCK_SIZE];
// #endif
//#endif
#if OUTPUT_IS_PACKED #if OUTPUT_IS_PACKED
} }
#endif #endif
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment