Fixed threading bug in PME

066cfd60 · Peter Eastman · d6ad6438 · 066cfd60 · 066cfd60
Commit 066cfd60 authored Apr 07, 2010 by Peter Eastman
Show whitespace changes
Inline Side-by-side

Showing with 12 additions and 6 deletions

platforms/opencl/src/OpenCLKernels.cpp platforms/opencl/src/OpenCLKernels.cpp +1 -0

platforms/opencl/src/kernels/pme.cl platforms/opencl/src/kernels/pme.cl +11 -6

No files found.
--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -1288,6 +1288,7 @@ void OpenCLCalcNonbondedForceKernel::executeForces(ContextImpl& context) {
            pmeUpdateBsplinesKernel.setArg<cl::Buffer>(1, pmeBsplineTheta->getDeviceBuffer());
            pmeUpdateBsplinesKernel.setArg<cl::Buffer>(2, pmeBsplineDtheta->getDeviceBuffer());
            pmeUpdateBsplinesKernel.setArg(3, 2*OpenCLContext::ThreadBlockSize*PmeOrder*sizeof(mm_float4), NULL);
+            pmeUpdateBsplinesKernel.setArg<cl::Buffer>(4, pmeAtomGridIndex->getDeviceBuffer());
            pmeSpreadChargeKernel.setArg<cl::Buffer>(0, pmeAtomGridIndex->getDeviceBuffer());
            pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeAtomRange->getDeviceBuffer());
            pmeSpreadChargeKernel.setArg<cl::Buffer>(2, pmeGrid->getDeviceBuffer());

--- a/platforms/opencl/src/kernels/pme.cl
+++ b/platforms/opencl/src/kernels/pme.cl
@@ -27,11 +27,6 @@ __kernel void findAtomRangeForGrid(__global float4* posq, __global float2* pmeAt
                pmeAtomRange[j] = i;
            last = gridIndex;
        }
-        // The grid index won't be needed again.  Reuse that component to hold the atom charge, thus saving
-        // an extra load operation in the charge spreading kernel.
-        pmeAtomGridIndex[i].y = posq[(int) atomData.x].w;
    }
    // Fill in values beyond the last atom.
@@ -43,7 +38,7 @@ __kernel void findAtomRangeForGrid(__global float4* posq, __global float2* pmeAt
    }
 }
-__kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineTheta, __global float4* pmeBsplineDTheta, __local float4* bsplinesCache) {
+__kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineTheta, __global float4* pmeBsplineDTheta, __local float4* bsplinesCache, __global float2* pmeAtomGridIndex) {
    const float4 scale = 1.0f/(PME_ORDER-1);
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
        __local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
@@ -79,6 +74,16 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT
            pmeBsplineDTheta[i+j*NUM_ATOMS] = ddata[j];
        }
    }
+    // The grid index won't be needed again.  Reuse that component to hold the atom charge, thus saving
+    // an extra load operation in the charge spreading kernel.
+    int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0);
+    int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0);
+    for (int i = start; i < end; ++i) {
+        float2 atomData = pmeAtomGridIndex[i];
+        pmeAtomGridIndex[i].y = posq[(int) atomData.x].w;
+    }
 }
 __kernel void gridSpreadCharge(__global float2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float2* pmeGrid, __global float4* pmeBsplineTheta) {