Cleaned up CUDA FFT code

82069311 · Peter Eastman · d3e91b15 · 82069311 · 82069311 · 82069311
Commit 82069311 authored Jun 02, 2015 by Peter Eastman
4 changed files
--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -665,7 +665,7 @@ private:
    std::vector<std::pair<int, int> > exceptionAtoms;
    double ewaldSelfEnergy, dispersionCoefficient, alpha;
    int interpolateForceThreads;
-    bool hasCoulomb, hasLJ, usePmeStream;
+    bool hasCoulomb, hasLJ, usePmeStream, useCudaFFT;
    static const int PmeOrder = 5;
 };

--- a/platforms/cuda/src/CudaFFT3D.cpp
+++ b/platforms/cuda/src/CudaFFT3D.cpp
@@ -168,14 +168,12 @@ static int getSmallestRadix(int size) {
 }
 CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads, int axis, bool forward, bool inputIsReal) {
-    int maxThreads = 256;//std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
+    int maxThreads = 256;
 //    while (maxThreads > 128 && maxThreads-64 >= zsize)
 //        maxThreads -= 64;
    int threadsPerBlock = zsize/getSmallestRadix(zsize);
-    bool isCPU = false;//context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU;
-    bool loopRequired = (threadsPerBlock > maxThreads || isCPU);
    stringstream source;
-    int blocksPerGroup = (loopRequired ? 1 : max(1, maxThreads/threadsPerBlock));
+    int blocksPerGroup = max(1, maxThreads/threadsPerBlock);
    int stage = 0;
    int L = zsize;
    int m = 1;
@@ -201,11 +199,6 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
        source<<"{\n";
        L = L/radix;
        source<<"// Pass "<<(stage+1)<<" (radix "<<radix<<")\n";
-        if (loopRequired) {
-            source<<"for (int i = threadIdx.x; i < "<<(L*m)<<"; i += blockDim.x) {\n";
-            source<<"int base = i;\n";
-        }
-        else {
        if (L*m < threadsPerBlock)
            source<<"if (threadIdx.x < "<<(blocksPerGroup*L*m)<<") {\n";
        else
@@ -213,7 +206,6 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
        source<<"int block = threadIdx.x/"<<(L*m)<<";\n";
        source<<"int i = threadIdx.x-block*"<<(L*m)<<";\n";
        source<<"int base = i+block*"<<zsize<<";\n";
-        }
        source<<"int j = i/"<<m<<";\n";
        if (radix == 7) {
            source<<"real2 c0 = data"<<input<<"[base];\n";
@@ -328,7 +320,6 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
    bool outputIsReal = (inputIsReal && axis == 2 && !forward);
    bool outputIsPacked = (inputIsReal && axis == 2 && forward);
    string outputSuffix = (outputIsReal ? ".x" : "");
-    if (loopRequired || true) {
    if (outputIsPacked)
        source<<"if (index < XSIZE*YSIZE && x < XSIZE/2+1)\n";
    else
@@ -338,17 +329,6 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
        source<<"out[y*(ZSIZE*(XSIZE/2+1))+i*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[i+block*ZSIZE]"<<outputSuffix<<";\n";
    else
            source<<"out[y*(ZSIZE*XSIZE)+i*XSIZE+x] = data"<<(stage%2)<<"[i+block*ZSIZE]"<<outputSuffix<<";\n";
-    }
-    else {
-        if (outputIsPacked) {
-            source<<"if (index < XSIZE*YSIZE && x < XSIZE/2+1)\n";
-            source<<"out[y*(ZSIZE*(XSIZE/2+1))+(threadIdx.x%ZSIZE)*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[threadIdx.x]"<<outputSuffix<<";\n";
-        }
-        else {
-            source<<"if (index < XSIZE*YSIZE)\n";
-            source<<"out[y*(ZSIZE*XSIZE)+(threadIdx.x%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[threadIdx.x]"<<outputSuffix<<";\n";
-        }
-    }
    map<string, string> replacements;
    replacements["XSIZE"] = context.intToString(xsize);
    replacements["YSIZE"] = context.intToString(ysize);
@@ -357,7 +337,6 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
    replacements["THREADS_PER_BLOCK"] = context.intToString(threadsPerBlock);
    replacements["M_PI"] = context.doubleToString(M_PI);
    replacements["COMPUTE_FFT"] = source.str();
-    replacements["LOOP_REQUIRED"] = (loopRequired ? "1" : "0");
    replacements["SIGN"] = (forward ? "1" : "-1");
    replacements["INPUT_TYPE"] = (inputIsReal && axis == 0 && forward ? "real" : "real2");
    replacements["OUTPUT_TYPE"] = (outputIsReal ? "real" : "real2");
@@ -366,6 +345,6 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
    replacements["OUTPUT_IS_PACKED"] = (outputIsPacked ? "1" : "0");
    CUmodule module = context.createModule(CudaKernelSources::vectorOps+context.replaceStrings(CudaKernelSources::fft, replacements));
    CUfunction kernel = context.getKernel(module, "execFFT");
-    threads = (isCPU ? 1 : blocksPerGroup*threadsPerBlock);
+    threads = blocksPerGroup*threadsPerBlock;
    return kernel;
 }
--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -1502,8 +1502,10 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
    if (pmeio != NULL)
        delete pmeio;
    if (hasInitializedFFT) {
+        if (useCudaFFT) {
            cufftDestroy(fftForward);
            cufftDestroy(fftBackward);
+        }
        if (usePmeStream) {
            cuStreamDestroy(pmeStream);
            cuEventDestroy(pmeSyncEvent);
@@ -1694,39 +1696,40 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                // Create required data structures.
                int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
+                directPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, cu.getComputeCapability() >= 2.0 ? 2*elementSize : 2*sizeof(long long), "originalPmeGrid");
-                directPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, cu.getComputeCapability() >= 2.0 ? elementSize : sizeof(long long), "originalPmeGrid");
+                reciprocalPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "reciprocalPmeGrid");
-                reciprocalPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*(gridSizeZ/2+1), 2*elementSize, "reciprocalPmeGrid");
                cu.addAutoclearBuffer(*directPmeGrid);
                pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
                pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
                pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
                pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
                pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
                sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
-                fft = new CudaFFT3D(cu, gridSizeX, gridSizeY, gridSizeZ, true);
+                useCudaFFT = false; // We might switch back in the future, once Nvidia has all their bugs worked out
+                if (useCudaFFT) {
                    cufftResult result = cufftPlan3d(&fftForward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C);
                    if (result != CUFFT_SUCCESS)
                        throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
                    result = cufftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
                    if (result != CUFFT_SUCCESS)
                        throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
                    cufftSetCompatibilityMode(fftForward, CUFFT_COMPATIBILITY_NATIVE);
                    cufftSetCompatibilityMode(fftBackward, CUFFT_COMPATIBILITY_NATIVE);
+                }
+                else
+                    fft = new CudaFFT3D(cu, gridSizeX, gridSizeY, gridSizeZ, true);
                // Prepare for doing PME on its own stream.
-                int cufftVersion;
+                char deviceName[100];
-                cufftGetVersion(&cufftVersion);
+                cuDeviceGetName(deviceName, 100, cu.getDevice());
-                usePmeStream = true;//(cu.getComputeCapability() < 5.0 && numParticles < 130000 && cufftVersion >= 6000 && cufftVersion != 7000); // Workarounds for various CUDA bugs
+                usePmeStream = (string(deviceName) != "GeForce GTX 980"); // Using a separate stream is slower on GTX 980
                if (usePmeStream) {
                    cuStreamCreate(&pmeStream, CU_STREAM_NON_BLOCKING);
+                    if (useCudaFFT) {
                        cufftSetStream(fftForward, pmeStream);
                        cufftSetStream(fftBackward, pmeStream);
+                    }
                    CHECK_RESULT(cuEventCreate(&pmeSyncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for NonbondedForce");
                    int recipForceGroup = force.getReciprocalSpaceForceGroup();
                    if (recipForceGroup < 0)
@@ -1896,11 +1899,15 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
            cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, directPmeGrid->getSize());
        }
-//        if (cu.getUseDoublePrecision())
+        if (useCudaFFT) {
-//            cufftExecD2Z(fftForward, (double*) directPmeGrid->getDevicePointer(), (double2*) reciprocalPmeGrid->getDevicePointer());
+            if (cu.getUseDoublePrecision())
-//        else
+                cufftExecD2Z(fftForward, (double*) directPmeGrid->getDevicePointer(), (double2*) reciprocalPmeGrid->getDevicePointer());
-//            cufftExecR2C(fftForward, (float*) directPmeGrid->getDevicePointer(), (float2*) reciprocalPmeGrid->getDevicePointer());
+            else
+                cufftExecR2C(fftForward, (float*) directPmeGrid->getDevicePointer(), (float2*) reciprocalPmeGrid->getDevicePointer());
+        }
+        else {
            fft->execFFT(*directPmeGrid, *reciprocalPmeGrid, true);
+        }
        if (includeEnergy) {
            void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
@@ -1914,12 +1921,15 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
                cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeConvolutionKernel, convolutionArgs, cu.getNumAtoms());
-//        if (cu.getUseDoublePrecision())
+        if (useCudaFFT) {
-//            cufftExecZ2D(fftBackward, (double2*) reciprocalPmeGrid->getDevicePointer(), (double*) directPmeGrid->getDevicePointer());
+            if (cu.getUseDoublePrecision())
-//        else
+                cufftExecZ2D(fftBackward, (double2*) reciprocalPmeGrid->getDevicePointer(), (double*) directPmeGrid->getDevicePointer());
-//            cufftExecC2R(fftBackward, (float2*) reciprocalPmeGrid->getDevicePointer(), (float*)  directPmeGrid->getDevicePointer());
+            else
+                cufftExecC2R(fftBackward, (float2*) reciprocalPmeGrid->getDevicePointer(), (float*)  directPmeGrid->getDevicePointer());
+        }
+        else {
            fft->execFFT(*reciprocalPmeGrid, *directPmeGrid, false);
+        }
        void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid->getDevicePointer(),
                cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer()};

--- a/platforms/cuda/src/kernels/fft.cu
+++ b/platforms/cuda/src/kernels/fft.cu
@@ -35,7 +35,6 @@ extern "C" __global__ void execFFT(const INPUT_TYPE* __restrict__ in, OUTPUT_TYP
 #if OUTPUT_IS_PACKED
        if (x < XSIZE/2+1) {
 #endif
-//#if LOOP_REQUIRED
        if (index < XSIZE*YSIZE)
            for (int i = threadIdx.x-block*THREADS_PER_BLOCK; i < ZSIZE; i += THREADS_PER_BLOCK)
    #if INPUT_IS_REAL
@@ -45,16 +44,6 @@ extern "C" __global__ void execFFT(const INPUT_TYPE* __restrict__ in, OUTPUT_TYP
    #else
                data0[i+block*ZSIZE] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+i];
    #endif
-//#else
-//        if (index < XSIZE*YSIZE && (threadIdx.x%BLOCK_SIZE) < ZSIZE)
-//    #if INPUT_IS_REAL
-//            data0[threadIdx.x] = make_real2(in[x*(YSIZE*ZSIZE)+y*ZSIZE+threadIdx.x%BLOCK_SIZE], 0);
-//    #elif INPUT_IS_PACKED
-//            data0[threadIdx.x] = loadComplexValue(in, x, y, threadIdx.x%BLOCK_SIZE);
-//    #else
-//            data0[threadIdx.x] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+threadIdx.x%BLOCK_SIZE];
-//    #endif
-//#endif
 #if OUTPUT_IS_PACKED
        }
 #endif