Optimizations to RMSDForce

56b199be · Peter Eastman · f4dc3110
Commit 56b199be authored Jan 24, 2018 by Peter Eastman
4 changed files
--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -6887,7 +6887,7 @@ double CudaCalcRMSDForceKernel::executeImpl(ContextImpl& context) {
    // Execute the first kernel.
    int numParticles = particles->getSize();
-    int blockSize = 128;
+    int blockSize = 256;
    void* args1[] = {&numParticles, &cu.getPosq().getDevicePointer(), &referencePos->getDevicePointer(),
            &particles->getDevicePointer(), &buffer->getDevicePointer()};
    cu.executeKernel(kernel1, args1, blockSize, blockSize, blockSize*sizeof(REAL));

--- a/platforms/cuda/src/kernels/rmsd.cu
+++ b/platforms/cuda/src/kernels/rmsd.cu
@@ -4,11 +4,16 @@
 /**
 * Sum a value over all threads.
 */
-__device__ real reduceValue(real value, real* temp) {
+__device__ real reduceValue(real value, volatile real* temp) {
    const int thread = threadIdx.x;
    temp[thread] = value;
    __syncthreads();
-    for (uint step = 1; step < blockDim.x; step *= 2) {
+    for (uint step = 1; step < 32; step *= 2) {
+        if (thread+step < blockDim.x && thread%(2*step) == 0)
+            temp[thread] = temp[thread] + temp[thread+step];
+        SYNC_WARPS
+    }
+    for (uint step = 32; step < blockDim.x; step *= 2) {
        if (thread+step < blockDim.x && thread%(2*step) == 0)
            temp[thread] = temp[thread] + temp[thread+step];
        __syncthreads();
@@ -21,7 +26,7 @@ __device__ real reduceValue(real value, real* temp) {
 */
 extern "C" __global__ void computeRMSDPart1(int numParticles, const real4* __restrict__ posq, const real4* __restrict__ referencePos,
         const int* __restrict__ particles, real* buffer) {
-    extern __shared__ real temp[];
+    extern __shared__ volatile real temp[];
    // Compute the center of the particle positions.

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -7167,7 +7167,7 @@ double OpenCLCalcRMSDForceKernel::executeImpl(ContextImpl& context) {
    // Execute the first kernel.
    int numParticles = particles->getSize();
-    int blockSize = 128;
+    int blockSize = min(256, (int) kernel1.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(cl.getDevice()));
    kernel1.setArg<cl_int>(0, numParticles);
    kernel1.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
    kernel1.setArg<cl::Buffer>(2, referencePos->getDeviceBuffer());

--- a/platforms/opencl/src/kernels/rmsd.cl
+++ b/platforms/opencl/src/kernels/rmsd.cl
@@ -4,11 +4,16 @@
 /**
 * Sum a value over all threads.
 */
-real reduceValue(real value, __local real* temp) {
+real reduceValue(real value, __local volatile real* temp) {
    const int thread = get_local_id(0);
    temp[thread] = value;
    barrier(CLK_LOCAL_MEM_FENCE);
-    for (uint step = 1; step < get_local_size(0); step *= 2) {
+    for (uint step = 1; step < 32; step *= 2) {
+        if (thread+step < get_local_size(0) && thread%(2*step) == 0)
+            temp[thread] = temp[thread] + temp[thread+step];
+        SYNC_WARPS
+    }
+    for (uint step = 32; step < get_local_size(0); step *= 2) {
        if (thread+step < get_local_size(0) && thread%(2*step) == 0)
            temp[thread] = temp[thread] + temp[thread+step];
        barrier(CLK_LOCAL_MEM_FENCE);
@@ -20,7 +25,7 @@ real reduceValue(real value, __local real* temp) {
 * Perform the first step of computing the RMSD.  This is executed as a single work group.
 */
 __kernel void computeRMSDPart1(int numParticles, __global const real4* restrict posq, __global const real4* restrict referencePos,
-        __global const int* restrict particles, __global real* buffer, __local real* restrict temp) {
+        __global const int* restrict particles, __global real* buffer, __local volatile real* restrict temp) {
    // Compute the center of the particle positions.
    real3 center = (real3) 0;