Made OpenCLContext::reduceBuffer additionally store the result into a 64-bit fixed point buffer. (#3209)

Made OpenCLContext::reduceBuffer additionally store the result into a 64-bit fixed point buffer. (#3209)

Made OpenCLContext::reduceBuffer additionally store the result into a 64-bit fixed point buffer. (#3209)
Made OpenCLContext::reduceBuffer additionally store the result into a 64-bit fixed point buffer. (#3209)
94057ffb · Dann239 · GitHub · 6bdaa495 · 94057ffb · 94057ffb
Unverified Commit 94057ffb authored Aug 13, 2021 by Dann239 Committed by GitHub Aug 13, 2021
4 changed files
--- a/platforms/opencl/include/OpenCLContext.h
+++ b/platforms/opencl/include/OpenCLContext.h
@@ -364,11 +364,13 @@ public:
    /**
     * Given a collection of floating point buffers packed into an array, sum them and store
     * the sum in the first buffer.
+     * Also, write the x, y, and z components of the sum, scaled by 2^32, into a 64-bit fixed point buffer (overwriting its contents).
     *
     * @param array       the array containing the buffers to reduce
+     * @param longBuffer  the 64-bit fixed point buffer to write the result into
     * @param numBuffers  the number of buffers packed into the array
     */
-    void reduceBuffer(OpenCLArray& array, int numBuffers);
+    void reduceBuffer(OpenCLArray& array, OpenCLArray& longBuffer, int numBuffers);
    /**
     * Sum the buffers containing forces.
     */

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -782,11 +782,12 @@ void OpenCLContext::reduceForces() {
    executeKernel(reduceForcesKernel, paddedNumAtoms, 128);
 }
-void OpenCLContext::reduceBuffer(OpenCLArray& array, int numBuffers) {
+void OpenCLContext::reduceBuffer(OpenCLArray& array, OpenCLArray& longBuffer, int numBuffers) {
    int bufferSize = array.getSize()/numBuffers;
    reduceReal4Kernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());
-    reduceReal4Kernel.setArg<cl_int>(1, bufferSize);
+    reduceReal4Kernel.setArg<cl::Buffer>(1, longBuffer.getDeviceBuffer());
-    reduceReal4Kernel.setArg<cl_int>(2, numBuffers);
+    reduceReal4Kernel.setArg<cl_int>(2, bufferSize);
+    reduceReal4Kernel.setArg<cl_int>(3, numBuffers);
    executeKernel(reduceReal4Kernel, bufferSize, 128);
 }

--- a/platforms/opencl/src/OpenCLParallelKernels.cpp
+++ b/platforms/opencl/src/OpenCLParallelKernels.cpp
@@ -179,7 +179,7 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
        int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
        cl.getQueue().enqueueWriteBuffer(contextForces.getDeviceBuffer(), CL_FALSE, numAtoms*elementSize,
                numAtoms*(data.contexts.size()-1)*elementSize, pinnedForceMemory);
-        cl.reduceBuffer(contextForces, data.contexts.size());
+        cl.reduceBuffer(contextForces, cl.getLongForceBuffer(), data.contexts.size());
        // Balance work between the contexts by transferring a little nonbonded work from the context that
        // finished last to the one that finished first.

--- a/platforms/opencl/src/kernels/utilities.cl
+++ b/platforms/opencl/src/kernels/utilities.cl
@@ -67,9 +67,10 @@ __kernel void clearSixBuffers(__global int* restrict buffer1, int size1, __globa
 /**
 * Sum a collection of buffers into the first one.
+ * Also, write the x, y, and z components of the sum, scaled by 2^32, into a 64-bit fixed point buffer (overwriting its contents).
 */
-__kernel void reduceReal4Buffer(__global real4* restrict buffer, int bufferSize, int numBuffers) {
+__kernel void reduceReal4Buffer(__global real4* restrict buffer, __global long* restrict longBuffer, int bufferSize, int numBuffers) {
    int index = get_global_id(0);
    int totalSize = bufferSize*numBuffers;
    while (index < bufferSize) {
@@ -77,6 +78,9 @@ __kernel void reduceReal4Buffer(__global real4* restrict buffer, int bufferSize,
        for (int i = index+bufferSize; i < totalSize; i += bufferSize)
            sum += buffer[i];
        buffer[index] = sum;
+        longBuffer[index] = (long) (sum.x*0x100000000);
+        longBuffer[index+bufferSize] = (long) (sum.y*0x100000000);
+        longBuffer[index+2*bufferSize] = (long) (sum.z*0x100000000);
        index += get_global_size(0);
    }
 }