Optimizations to PME.

dc677fac · Peter Eastman · 24148147 · dc677fac · dc677fac · dc677fac
Commit dc677fac authored Sep 08, 2009 by Peter Eastman
9 changed files
--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -308,10 +308,10 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
            double my = boxVectors[1][1]/force.getCutoffDistance();
            double mz = boxVectors[2][2]/force.getCutoffDistance();
            double pi = 3.1415926535897932385;
+            int kmaxx = (int)std::ceil(-(mx/pi)*std::log(ewaldErrorTol));
+            int kmaxy = (int)std::ceil(-(my/pi)*std::log(ewaldErrorTol));
+            int kmaxz = (int)std::ceil(-(mz/pi)*std::log(ewaldErrorTol));
            if (force.getNonbondedMethod() == NonbondedForce::Ewald) {
-                int kmaxx = (int)std::ceil(-(mx/pi)*std::log(ewaldErrorTol));
-                int kmaxy = (int)std::ceil(-(my/pi)*std::log(ewaldErrorTol));
-                int kmaxz = (int)std::ceil(-(mz/pi)*std::log(ewaldErrorTol));
                if (kmaxx%2 == 0)
                    kmaxx++;
                if (kmaxy%2 == 0)
@@ -322,7 +322,13 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                method = EWALD;
            }
            else {
-                gpuSetPMEParameters(gpu, (float) alpha);
+                int gridSizeX = kmaxx*3;
+                int gridSizeY = kmaxy*3;
+                int gridSizeZ = kmaxz*3;
+                gridSizeX = ((gridSizeX+3)/4)*4;
+                gridSizeY = ((gridSizeY+3)/4)*4;
+                gridSizeZ = ((gridSizeZ+3)/4)*4;
+                gpuSetPMEParameters(gpu, (float) alpha, gridSizeX, gridSizeY, gridSizeZ);
                method = PARTICLE_MESH_EWALD;
            }
        }

--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
@@ -358,6 +358,9 @@ struct cudaGmxSimulation {
    float4*         pTabulatedFunctionCoefficients[MAX_TABULATED_FUNCTIONS]; // The spline coefficients for each tabulated function
    float4*         pTabulatedFunctionParams;       // The min, max, and spacing for each tabulated function
    float2*         pEwaldCosSinSum;                // Pointer to the cos/sin sums (ewald)
+    float*          pTabulatedErfc;                 // Tabulated values for erfc()
+    int             tabulatedErfcSize;              // The number of tabulated values for erfc()
+    float           tabulatedErfcScale;             // Scale factor for the argument to erfc()
    int3            pmeGridSize;                    // The dimensions of the grid for particle mesh Ewald
    int3            pmeGroupSize;                   // The dimensions of the groups used in charge spreading for PME
    cufftComplex*   pPmeGrid;                       // Grid points for particle mesh Ewald
@@ -366,7 +369,6 @@ struct cudaGmxSimulation {
    float4*         pPmeBsplineDtheta;
    int4*           pPmeParticleIndex;              // The grid indices for each atom
    float4*         pPmeParticleFraction;           // Fractional offset in the grid for each atom in all three dimensions.
-    int*            pPmeInteractionFlags;           // Flags for which groups of grid points interact with which atoms
    int*            pPmeAtomRange;                  // The range of sorted atoms at each grid point
    float2*         pPmeAtomGridIndex;              // The grid point each atom is at
    unsigned int    bonds;                          // Number of bonds

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -747,6 +747,18 @@ void gpuSetCustomNonbondedParameters(gpuContext gpu, const vector<vector<double>
    delete fp;
 }
+static void tabulateErfc(gpuContext gpu)
+{
+    int tableSize = 2048;
+    gpu->sim.tabulatedErfcSize = tableSize;
+    gpu->sim.tabulatedErfcScale = tableSize/(gpu->sim.alphaEwald*gpu->sim.nonbondedCutoff);
+    gpu->psTabulatedErfc = new CUDAStream<float>(tableSize, 1, "TabulatedErfc");
+    gpu->sim.pTabulatedErfc = gpu->psTabulatedErfc->_pDevData;
+    for (int i = 0; i < tableSize; ++i)
+        (*gpu->psTabulatedErfc)[i] = erfc(i*(gpu->sim.alphaEwald*gpu->sim.nonbondedCutoff)/tableSize);
+    gpu->psTabulatedErfc->Upload();
+}
 extern "C"
 void gpuSetEwaldParameters(gpuContext gpu, float alpha, int kmaxx, int kmaxy, int kmaxz)
 {
@@ -757,13 +769,14 @@ void gpuSetEwaldParameters(gpuContext gpu, float alpha, int kmaxx, int kmaxy, in
    gpu->sim.kmaxZ              = kmaxz;
    gpu->psEwaldCosSinSum       = new CUDAStream<float2>((gpu->sim.kmaxX*2-1) * (gpu->sim.kmaxY*2-1) * (gpu->sim.kmaxZ*2-1), 1, "EwaldCosSinSum");
    gpu->sim.pEwaldCosSinSum    = gpu->psEwaldCosSinSum->_pDevStream[0];
+    tabulateErfc(gpu);
 }
 extern "C"
-void gpuSetPMEParameters(gpuContext gpu, float alpha)
+void gpuSetPMEParameters(gpuContext gpu, float alpha, int gridSizeX, int gridSizeY, int gridSizeZ)
 {
    gpu->sim.alphaEwald         = alpha;
-    int3 gridSize = make_int3(32, 32, 32);
+    int3 gridSize = make_int3(gridSizeX, gridSizeY, gridSizeZ);
    gpu->sim.pmeGridSize = gridSize;
    int3 groupSize = make_int3(2, 4, 4);
    gpu->sim.pmeGroupSize = groupSize;
@@ -786,12 +799,11 @@ void gpuSetPMEParameters(gpuContext gpu, float alpha)
    gpu->sim.pPmeParticleIndex = gpu->psPmeParticleIndex->_pDevData;
    gpu->psPmeParticleFraction = new CUDAStream<float4>(gpu->natoms, 1, "PmeParticleFraction");
    gpu->sim.pPmeParticleFraction = gpu->psPmeParticleFraction->_pDevData;
-    gpu->psPmeInteractionFlags = new CUDAStream<int>(totalGroups*(gpu->sim.paddedNumberOfAtoms/32), 1, "PmeInteractionFlags");
-    gpu->sim.pPmeInteractionFlags = gpu->psPmeInteractionFlags->_pDevData;
    gpu->psPmeAtomRange = new CUDAStream<int>(gridSize.x*gridSize.y*gridSize.z+1, 1, "PmeAtomRange");
    gpu->sim.pPmeAtomRange = gpu->psPmeAtomRange->_pDevData;
    gpu->psPmeAtomGridIndex = new CUDAStream<float2>(gpu->natoms, 1, "PmeAtomGridIndex");
    gpu->sim.pPmeAtomGridIndex = gpu->psPmeAtomGridIndex->_pDevData;
+    tabulateErfc(gpu);
    // Initialize the b-spline moduli.
@@ -1684,6 +1696,7 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
    gpu->psCustomExceptionID        = NULL;
    gpu->psCustomExceptionParams    = NULL;
    gpu->psEwaldCosSinSum           = NULL;
+    gpu->psTabulatedErfc            = NULL;
    gpu->psPmeGrid                  = NULL;
    gpu->psPmeBsplineModuli[0]      = NULL;
    gpu->psPmeBsplineModuli[1]      = NULL;
@@ -1863,6 +1876,7 @@ void gpuShutDown(gpuContext gpu)
        delete gpu->psPmeParticleFraction;
        delete gpu->psPmeAtomRange;
        delete gpu->psPmeAtomGridIndex;
+        delete gpu->psTabulatedErfc;
        cufftDestroy(gpu->fftplan);
    }
    delete gpu->psObcData;

--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
@@ -106,13 +106,13 @@ struct _gpuContext {
    CUDAStream<float4>* psCustomExceptionParams; // Parameters for custom nonbonded exceptions
    CUDAStream<float4>* psTabulatedFunctionParams; // The min, max, and spacing for each tabulated function
    CUDAStream<float2>* psEwaldCosSinSum;
+    CUDAStream<float>* psTabulatedErfc;     // Tabulated values for erfc()
    CUDAStream<cufftComplex>* psPmeGrid;    // Grid points for particle mesh Ewald
    CUDAStream<float>* psPmeBsplineModuli[3];
    CUDAStream<float4>* psPmeBsplineTheta;
    CUDAStream<float4>* psPmeBsplineDtheta;
    CUDAStream<int4>* psPmeParticleIndex;      // The grid indices for each atom
    CUDAStream<float4>* psPmeParticleFraction; // Fractional offset in the grid for each atom in all three dimensions.
-    CUDAStream<int>* psPmeInteractionFlags;    // Flags for which groups of grid points interact with which atoms
    CUDAStream<int>* psPmeAtomRange;           // The range of sorted atoms at each grid point
    CUDAStream<float2>* psPmeAtomGridIndex;    // The grid point each atom is at
    CUDAStream<float2>* psObcData;
@@ -217,7 +217,7 @@ extern "C"
 void gpuSetEwaldParameters(gpuContext gpu, float alpha, int kmaxx, int kmaxy, int kmaxz);
 extern "C"
-void gpuSetPMEParameters(gpuContext gpu, float alpha);
+void gpuSetPMEParameters(gpuContext gpu, float alpha, int gridSizeX, int gridSizeY, int gridSizeZ);
 extern "C"
 void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize);

--- a/platforms/cuda/src/kernels/kCalculateCDLJForces.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJForces.cu
@@ -67,6 +67,17 @@ void GetCalculateCDLJForcesSim(gpuContext gpu)
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
 }
+texture<float, 1, cudaReadModeElementType> tabulatedErfcRef;
+__device__ float fastErfc(float r)
+{
+    float normalized = cSim.tabulatedErfcScale*r;
+    int index = (int) normalized;
+    float fract2 = normalized-index;
+    float fract1 = 1.0f-fract2;
+    return fract1*tex1Dfetch(tabulatedErfcRef, index) + fract2*tex1Dfetch(tabulatedErfcRef, index+1);
+}
 // Include versions of the kernels for N^2 calculations.
 #define METHOD_NAME(a, b) a##N2##b
@@ -167,27 +178,6 @@ void kCalculateCDLJForces(gpuContext gpu)
            LAUNCHERROR("kCalculateCDLJPeriodicForces");
            break;
        case EWALD:
-            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
-            LAUNCHERROR("kFindBlockBoundsPeriodic");
-            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
-            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
-            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
-            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateCDLJEwaldByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            else
-                kCalculateCDLJEwaldForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            LAUNCHERROR("kCalculateCDLJEwaldForces");
-            // Ewald summation
-            kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
-            LAUNCHERROR("kCalculateEwaldFastCosSinSums");
-            kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
-            LAUNCHERROR("kCalculateEwaldFastForces");
-            break;
        case PARTICLE_MESH_EWALD:
            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
            LAUNCHERROR("kFindBlockBoundsPeriodic");
@@ -197,6 +187,8 @@ void kCalculateCDLJForces(gpuContext gpu)
            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
            LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
+            cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+            cudaBindTexture(NULL, &tabulatedErfcRef, gpu->psTabulatedErfc->_pDevData, &channelDesc, gpu->psTabulatedErfc->_length*sizeof(float));
            if (gpu->bOutputBufferPerWarp)
                kCalculateCDLJEwaldByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
@@ -204,6 +196,14 @@ void kCalculateCDLJForces(gpuContext gpu)
                kCalculateCDLJEwaldForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
            LAUNCHERROR("kCalculateCDLJEwaldForces");
-            kCalculatePME(gpu);
+            if (gpu->sim.nonbondedMethod == EWALD)
+            {
+                kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
+                LAUNCHERROR("kCalculateEwaldFastCosSinSums");
+                kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+                LAUNCHERROR("kCalculateEwaldFastForces");
+            }
+            else
+                kCalculatePME(gpu);
    }
 }
--- a/platforms/cuda/src/kernels/kCalculateCDLJForces.h
+++ b/platforms/cuda/src/kernels/kCalculateCDLJForces.h
@@ -118,7 +118,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
    #ifdef USE_EWALD
                    float r         = sqrt(r2);
                    float alphaR    = cSim.alphaEwald * r;
-                    float erfcAlphaR = erfc(alphaR);
+                    float erfcAlphaR = fastErfc(alphaR);
                    dEdR           += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI );
 		    /* E */
                    CDLJ_energy    += apos.w * psA[j].q * invR * erfcAlphaR;
@@ -180,7 +180,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
    #ifdef USE_EWALD
                    float r         = sqrt(r2);
                    float alphaR    = cSim.alphaEwald * r;
-                    float erfcAlphaR = erfc(alphaR);
+                    float erfcAlphaR = fastErfc(alphaR);
                    dEdR           += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
 		    /* E */
 		    CDLJ_energy    += apos.w * psA[j].q * invR * erfcAlphaR;
@@ -303,7 +303,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
    #ifdef USE_EWALD
                        float r         = sqrt(r2);
                        float alphaR    = cSim.alphaEwald * r;
-                        float erfcAlphaR = erfc(alphaR);
+                        float erfcAlphaR = fastErfc(alphaR);
                        dEdR           += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
                        /* E */
                        CDLJ_energy    += apos.w * psA[tj].q * invR * erfcAlphaR;
@@ -371,7 +371,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
    #ifdef USE_EWALD
                            float r         = sqrt(r2);
                            float alphaR    = cSim.alphaEwald * r;
-                            float erfcAlphaR = erfc(alphaR);
+                            float erfcAlphaR = fastErfc(alphaR);
                            dEdR           += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
                            CDLJ_energy    += apos.w * psA[j].q * invR * erfcAlphaR;
    #else
@@ -474,7 +474,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
    #ifdef USE_EWALD
                    float r         = sqrt(r2);
                    float alphaR    = cSim.alphaEwald * r;
-                    float erfcAlphaR = erfc(alphaR);
+                    float erfcAlphaR = fastErfc(alphaR);
                    dEdR           += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
                    /* E */
                    CDLJ_energy    += apos.w * psA[tj].q * invR * erfcAlphaR;

--- a/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
@@ -63,6 +63,16 @@ void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
 }
+texture<float, 1, cudaReadModeElementType> tabulatedErfcRef;
+__device__ float fastErfc(float r)
+{
+    float normalized = cSim.tabulatedErfcScale*r;
+    int index = (int) normalized;
+    float fract2 = normalized-index;
+    float fract1 = 1.0f-fract2;
+    return fract1*tex1Dfetch(tabulatedErfcRef, index) + fract2*tex1Dfetch(tabulatedErfcRef, index+1);
+}
 // Include versions of the kernel for N^2 calculations.
@@ -186,6 +196,7 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
            LAUNCHERROR("kCalculateCDLJObcGbsaPeriodicForces1");
            break;
        case EWALD:
+        case PARTICLE_MESH_EWALD:
            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
            LAUNCHERROR("kFindBlockBoundsPeriodic");
            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
@@ -199,6 +210,8 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
                kCalculateObcGbsaBornSum(gpu);
                kReduceObcGbsaBornSum(gpu);
            }
+            cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+            cudaBindTexture(NULL, &tabulatedErfcRef, gpu->psTabulatedErfc->_pDevData, &channelDesc, gpu->psTabulatedErfc->_length*sizeof(float));
            if (gpu->bOutputBufferPerWarp)
                kCalculateCDLJObcGbsaEwaldByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
@@ -206,33 +219,15 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
                kCalculateCDLJObcGbsaEwaldForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
            LAUNCHERROR("kCalculateCDLJObcGbsaEwaldForces");
-            // Ewald summation
+            if (gpu->sim.nonbondedMethod == EWALD)
-            kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
-            LAUNCHERROR("kCalculateEwaldFastCosSinSums");
-            kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
-            LAUNCHERROR("kCalculateEwaldFastForces");
-            break;
-        case PARTICLE_MESH_EWALD:
-            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
-            LAUNCHERROR("kFindBlockBoundsPeriodic");
-            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
-            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
-            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
-            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
-            if (gpu->bRecalculateBornRadii)
            {
-                kCalculateObcGbsaBornSum(gpu);
+                // Ewald summation
-                kReduceObcGbsaBornSum(gpu);
+                kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
+                LAUNCHERROR("kCalculateEwaldFastCosSinSums");
+                kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+                LAUNCHERROR("kCalculateEwaldFastForces");
            }
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateCDLJObcGbsaEwaldByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
            else
-                kCalculateCDLJObcGbsaEwaldForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                kCalculatePME(gpu);
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            LAUNCHERROR("kCalculateCDLJObcGbsaEwaldForces");
-            kCalculatePME(gpu);
    }
 }
--- a/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.h
+++ b/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.h
@@ -113,7 +113,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
    #ifdef USE_EWALD
                    float r                 = sqrt(r2);
                    float alphaR            = cSim.alphaEwald * r;
-                    float erfcAlphaR        = erfc(alphaR);
+                    float erfcAlphaR        = fastErfc(alphaR);
                    dEdR                   += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
 		    /* E */
                    CDLJObcGbsa_energy     += apos.w * psA[j].q * invR * erfcAlphaR;
@@ -195,7 +195,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
    #ifdef USE_EWALD
                    float r                 = sqrt(r2);
                    float alphaR            = cSim.alphaEwald * r;
-                    float erfcAlphaR        = erfc(alphaR);
+                    float erfcAlphaR        = fastErfc(alphaR);
                    dEdR                   += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
 		    /* E */
                    CDLJObcGbsa_energy     += apos.w * psA[j].q * invR * erfcAlphaR;
@@ -350,7 +350,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
    #ifdef USE_EWALD
                        float r                 = sqrt(r2);
                        float alphaR            = cSim.alphaEwald * r;
-                        float erfcAlphaR        = erfc(alphaR);
+                        float erfcAlphaR        = fastErfc(alphaR);
                        dEdR                   += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
                        /* E */
                        CDLJObcGbsa_energy     += apos.w * psA[tj].q * invR * erfcAlphaR;
@@ -439,7 +439,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
    #ifdef USE_EWALD
                            float r                 = sqrt(r2);
                            float alphaR            = cSim.alphaEwald * r;
-                            float erfcAlphaR        = erfc(alphaR);
+                            float erfcAlphaR        = fastErfc(alphaR);
                            dEdR                   += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
                            CDLJObcGbsa_energy     += apos.w * psA[j].q * invR * erfcAlphaR;
    #else
@@ -572,7 +572,7 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
    #ifdef USE_EWALD
                    float r                 = sqrt(r2);
                    float alphaR            = cSim.alphaEwald * r;
-                    float erfcAlphaR        = erfc(alphaR);
+                    float erfcAlphaR        = fastErfc(alphaR);
                    dEdR                   += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
                    /* E */
                    CDLJObcGbsa_energy     += apos.w * psA[tj].q * invR * erfcAlphaR;

--- a/platforms/cuda/src/kernels/kCalculatePME.cu
+++ b/platforms/cuda/src/kernels/kCalculatePME.cu
@@ -49,6 +49,8 @@ void GetCalculatePMESim(gpuContext gpu)
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
 }
+texture<float4, 1, cudaReadModeElementType> bsplineThetaRef;
 inline __host__ __device__ int fast_mod(int a, int b)
 {
    return (b & (b - 1)) ? a % b : a & (b - 1);
@@ -112,48 +114,6 @@ __global__ void kUpdateGridIndexAndFraction_kernel()
        cSim.pPmeParticleIndex[i] = itmp;
        cSim.pPmeAtomGridIndex[i] = make_float2(i, itmp.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+itmp.y*cSim.pmeGridSize.z+itmp.z);
    }
-//
-//    // Compute flags for which atoms affect which groups of grid points.
-//
-//    const int3 numGroups = make_int3((cSim.pmeGridSize.x+cSim.pmeGroupSize.x-1)/cSim.pmeGroupSize.x, (cSim.pmeGridSize.y+cSim.pmeGroupSize.y-1)/cSim.pmeGroupSize.y, (cSim.pmeGridSize.z+cSim.pmeGroupSize.z-1)/cSim.pmeGroupSize.z);
-//    const unsigned int totalGroups = numGroups.x*numGroups.y*numGroups.z;
-//    const float3 gridScale = make_float3(cSim.pmeGridSize.x/cSim.periodicBoxSizeX, cSim.pmeGridSize.y/cSim.periodicBoxSizeY, cSim.pmeGridSize.z/cSim.periodicBoxSizeZ);
-//    for (int group = tid; group < totalGroups; group += tnb)
-//    {
-//        int3 gridBase;
-//        gridBase.x = group/(numGroups.y*numGroups.z);
-//        int remainder = group-gridBase.x*numGroups.y*numGroups.z;
-//        gridBase.y = remainder/numGroups.z;
-//        gridBase.z = remainder-gridBase.y*numGroups.z;
-//        gridBase.x *= cSim.pmeGroupSize.x;
-//        gridBase.y *= cSim.pmeGroupSize.y;
-//        gridBase.z *= cSim.pmeGroupSize.z;
-//        unsigned int flags = 0;
-//        unsigned int baseIndex = group*(cSim.paddedNumberOfAtoms/32);
-//        for (int atomBlock = 0; atomBlock < cSim.paddedNumberOfAtoms>>GRIDBITS; atomBlock++)
-//        {
-//            // Decide if this block actually needs to be processed.
-//
-//            int flagIndex = atomBlock%32;
-//            if (flagIndex == 0)
-//                flags = 0;
-//            float4 boxSize = cSim.pGridBoundingBox[atomBlock];
-//            float4 center = cSim.pGridCenter[atomBlock];
-//            int maxx = (int) ceil((center.x+boxSize.x)*gridScale.x)+cSim.pmeGroupSize.x+PME_ORDER;
-//            int maxy = (int) ceil((center.y+boxSize.y)*gridScale.y)+cSim.pmeGroupSize.y+PME_ORDER;
-//            int maxz = (int) ceil((center.z+boxSize.z)*gridScale.z)+cSim.pmeGroupSize.z+PME_ORDER;
-//            int minx = (int) floor((center.x-boxSize.x)*gridScale.x);
-//            int miny = (int) floor((center.y-boxSize.y)*gridScale.y);
-//            int minz = (int) floor((center.z-boxSize.z)*gridScale.z);
-//            int x = minx+(gridBase.x-minx)%cSim.pmeGridSize.x;
-//            int y = miny+(gridBase.y-miny)%cSim.pmeGridSize.y;
-//            int z = minz+(gridBase.z-minz)%cSim.pmeGridSize.z;
-//            if (maxx < x || maxy < y || maxz < z)
-//                flags += 1<<flagIndex;
-//            if (flagIndex == 31 || atomBlock == cSim.paddedNumberOfAtoms>>GRIDBITS)
-//                cSim.pPmeInteractionFlags[baseIndex+atomBlock/32] = flags;
-//        }
-//    }
 }
 /**
@@ -259,82 +219,6 @@ __global__ void kUpdateBsplines_kernel()
    }
 }
-//__global__ void kGridSpreadCharge_kernel()
-//{
-//    extern __shared__ float atomCharge[];
-//    int4* atomGridIndex = (int4*) &atomCharge[blockDim.x];
-//    const unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
-//    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
-//    const int3 numGroups = make_int3((cSim.pmeGridSize.x+cSim.pmeGroupSize.x-1)/cSim.pmeGroupSize.x, (cSim.pmeGridSize.y+cSim.pmeGroupSize.y-1)/cSim.pmeGroupSize.y, (cSim.pmeGridSize.z+cSim.pmeGroupSize.z-1)/cSim.pmeGroupSize.z);
-//    const unsigned int totalGroups = numGroups.x*numGroups.y*numGroups.z;
-//    unsigned int group = warp*totalGroups/totalWarps;
-//    const unsigned int end = (warp+1)*totalGroups/totalWarps;
-//    const unsigned int index = threadIdx.x & (GRID - 1);
-//
-//    while (group < end)
-//    {
-//        // Process a group of grid points of size cSim.pmeGroupSize.  First figure out the base index for the group,
-//        // and the index of the specific point this thread will handle.
-//
-//        int3 gridBase;
-//        gridBase.x = group/(numGroups.y*numGroups.z);
-//        int remainder = group-gridBase.x*numGroups.y*numGroups.z;
-//        gridBase.y = remainder/numGroups.z;
-//        gridBase.z = remainder-gridBase.y*numGroups.z;
-//        gridBase.x *= cSim.pmeGroupSize.x;
-//        gridBase.y *= cSim.pmeGroupSize.y;
-//        gridBase.z *= cSim.pmeGroupSize.z;
-//        int3 gridPoint;
-//        gridPoint.x = index/(cSim.pmeGroupSize.y*cSim.pmeGroupSize.z);
-//        remainder = index-gridPoint.x*cSim.pmeGroupSize.y*cSim.pmeGroupSize.z;
-//        gridPoint.y = remainder/cSim.pmeGroupSize.z;
-//        gridPoint.z = remainder-gridPoint.y*cSim.pmeGroupSize.z;
-//        gridPoint.x += gridBase.x;
-//        gridPoint.y += gridBase.y;
-//        gridPoint.z += gridBase.z;
-//
-//        // Loop over blocks of atoms.
-//
-//        float result = 0.0f;
-//        int flags = 0;
-//        unsigned int baseIndex = group*(cSim.paddedNumberOfAtoms/32);
-//        for (int atomBlock = 0; atomBlock < cSim.paddedNumberOfAtoms>>GRIDBITS; atomBlock++)
-//        {
-//            // Decide if this block actually needs to be processed.
-//
-//            int flagIndex = atomBlock%32;
-//            if (flagIndex == 0)
-//                flags = cSim.pPmeInteractionFlags[baseIndex+atomBlock/32];
-//            if ((flags & (1<<flagIndex)) != 0)
-//                continue;
-//            int atomIndex = (atomBlock<<GRIDBITS)+index;
-//            if (atomIndex < cSim.atoms)
-//            {
-//                atomCharge[threadIdx.x] = cSim.pPosq[atomIndex].w;
-//                atomGridIndex[threadIdx.x] = cSim.pPmeParticleIndex[atomIndex];
-//            }
-//            int maxAtoms = min(GRID, cSim.atoms-(atomBlock<<GRIDBITS));
-//            for (int i = 0; i < maxAtoms; i++)
-//            {
-//                int localIndex = threadIdx.x-index+i;
-//                int atomIndex = (atomBlock<<GRIDBITS)+i;
-//                int ix = gridPoint.x-atomGridIndex[localIndex].x;
-//                int iy = gridPoint.y-atomGridIndex[localIndex].y;
-//                int iz = gridPoint.z-atomGridIndex[localIndex].z;
-//                ix += (ix < 0 ? cSim.pmeGridSize.x : 0);
-//                iy += (iy < 0 ? cSim.pmeGridSize.y : 0);
-//                iz += (iz < 0 ? cSim.pmeGridSize.z : 0);
-//                if (ix < PME_ORDER && iy < PME_ORDER && iz < PME_ORDER)
-//                    result += atomCharge[threadIdx.x-index+i]*cSim.pPmeBsplineTheta[atomIndex+ix*cSim.atoms].x*cSim.pPmeBsplineTheta[atomIndex+iy*cSim.atoms].y*cSim.pPmeBsplineTheta[atomIndex+iz*cSim.atoms].z;
-//            }
-//        }
-//        unsigned int gridIndex = gridPoint.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+gridPoint.y*cSim.pmeGridSize.z+gridPoint.z;
-//        if (gridIndex < cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z)
-//            cSim.pPmeGrid[gridIndex] = make_cuComplex(result*sqrt(cSim.epsfac), 0.0f);
-//        group++;
-//    }
-//}
 __global__ void kGridSpreadCharge_kernel()
 {
    unsigned int numGridPoints = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
@@ -365,7 +249,7 @@ __global__ void kGridSpreadCharge_kernel()
                        float2 atomData = cSim.pPmeAtomGridIndex[i];
                        int atomIndex = atomData.x;
                        float atomCharge = atomData.y;
-                        result += atomCharge*cSim.pPmeBsplineTheta[atomIndex+ix*cSim.atoms].x*cSim.pPmeBsplineTheta[atomIndex+iy*cSim.atoms].y*cSim.pPmeBsplineTheta[atomIndex+iz*cSim.atoms].z;
+                        result += atomCharge*tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).x*tex1Dfetch(bsplineThetaRef, atomIndex+iy*cSim.atoms).y*tex1Dfetch(bsplineThetaRef, atomIndex+iz*cSim.atoms).z;
                    }
                }
        cSim.pPmeGrid[gridIndex] = make_cuComplex(result*sqrt(cSim.epsfac), 0.0f);
@@ -447,6 +331,8 @@ __global__ void kGridInterpolateForce_kernel()
 void kCalculatePME(gpuContext gpu)
 {
 //    printf("kCalculatePME\n");
+    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float4>();
+    cudaBindTexture(NULL, &bsplineThetaRef, gpu->psPmeBsplineTheta->_pDevData, &channelDesc, gpu->psPmeBsplineTheta->_length*sizeof(float4));
    kUpdateGridIndexAndFraction_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
    LAUNCHERROR("kUpdateGridIndexAndFraction");
    bbSort(gpu->psPmeAtomGridIndex->_pDevData, gpu->natoms);
@@ -455,11 +341,8 @@ void kCalculatePME(gpuContext gpu)
    unsigned int threads = 16380/(2*PME_ORDER*sizeof(float4));
    kUpdateBsplines_kernel<<<gpu->sim.blocks, threads, 2*threads*PME_ORDER*sizeof(float4)>>>();
    LAUNCHERROR("kUpdateBsplines");
-    kGridSpreadCharge_kernel<<<gpu->sim.blocks, 64, 64*(sizeof(float)+sizeof(int4))>>>();
+    kGridSpreadCharge_kernel<<<8*gpu->sim.blocks, 64, 64*(sizeof(float)+sizeof(int4))>>>();
    LAUNCHERROR("kGridSpreadCharge");
-//    gpu->psPmeGrid->Download();
-//    for (int i = 0; i < gpu->psPmeGrid->_length; i += 100)
-//        printf("%d %f %f\n", i, (*gpu->psPmeGrid)[i].x, (*gpu->psPmeGrid)[i].y);
    cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_FORWARD);
    kReciprocalConvolution_kernel<<<gpu->sim.blocks, gpu->sim.nonbond_threads_per_block>>>();
    LAUNCHERROR("kReciprocalConvolution");