Minor optimizations to CUDA kernels (#2861)

938afab0 · peastman · GitHub · 6ca68730 · 938afab0 · 938afab0
Unverified commit 938afab0, authored Sep 24, 2020 by peastman; committed by GitHub on Sep 24, 2020.
Hide whitespace changes
Inline Side-by-side

Showing with 3 additions and 3 deletions

platforms/cuda/src/CudaKernels.cpp: +1 -1

platforms/cuda/src/kernels/pme.cu: +2 -2

No files found.
--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -843,7 +843,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
                pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
                pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
-                cuFuncSetCacheConfig(pmeSpreadChargeKernel, CU_FUNC_CACHE_PREFER_L1);
+                cuFuncSetCacheConfig(pmeSpreadChargeKernel, CU_FUNC_CACHE_PREFER_SHARED);
                cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
                if (doLJPME) {
                    pmeDefines["EWALD_ALPHA"] = cu.doubleToString(dispersionAlpha);

--- a/platforms/cuda/src/kernels/pme.cu
+++ b/platforms/cuda/src/kernels/pme.cu
@@ -56,8 +56,6 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
 #else
        const real charge = (CHARGE)*EPSILON_FACTOR;
 #endif
-        if (charge == 0)
-            continue;
        APPLY_PERIODIC_TO_POS(pos)
        real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
                             pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
@@ -68,6 +66,8 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
        int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
                                   ((int) t.y) % GRID_SIZE_Y,
                                   ((int) t.z) % GRID_SIZE_Z);
+        if (charge == 0)
+            continue;

        // Since we need the full set of thetas, it's faster to compute them here than load them
        // from global memory.